#! /usr/bin/env python3
# -*- coding: iso-8859-1 -*-
# Originally written by Barry Warsaw <barry@python.org>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf@artcom-gmbh.de>
#
# 2002-11-22 Jürgen Hermann <jh@web.de>
# Added checks that _() only contains string literals, and
# command line args are resolved to module lists, i.e. you
# can now pass a filename, a module or package name, or a
# directory (including globbing chars, important for Win32).
# Made docstring fit in 80 chars wide displays using pydoc.
#

# for selftesting
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    _ = lambda s: s

__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, GNU) provide extensive tools that ease the
internationalization of C programs.  Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++).  Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings.  xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code.  From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation.  GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop.  But those can be a lot of text to include all over your
code.  C and C++ have a trick: they use the C preprocessor.  Most
internationalized C source includes a #define for gettext() to _() so that
much less has to be written in the source.  Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well.  Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

    [1] https://www.python.org/workshops/1997-10/proceedings/loewis.html
    [2] https://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext wherever possible.  However some options are still missing or are
not fully implemented.  Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings.  These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings.  (See also the -X option.)

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above).  Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source.  These lines appear before
        each msgid.  The style of comments is controlled by the -S/--style
        option.  This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename.  If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments.  Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive.  GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not to be
        extracted from the input files.  Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted.  This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")

import os
import importlib.machinery
import importlib.util
import sys
import glob
import time
import getopt
import ast
import token
import tokenize

__version__ = '1.5'

default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''


# The normal pot-file header.  msgmerge and Emacs's po-mode work better if
# it's there.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')


def usage(code, msg=''):
    print(__doc__ % globals(), file=sys.stderr)
    if msg:
        print(msg, file=sys.stderr)
    sys.exit(code)


def make_escapes(pass_nonascii):
    global escapes, escape
    if pass_nonascii:
        # Allow non-ascii characters to pass through so that e.g. 'msgid
        # "Höhe"' would not result in 'msgid "H\366he"'.  Otherwise we
        # escape any character outside the 32..126 range.
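        # (In this mode the escape table built below only covers bytes
        # 0..127 and escape_ascii() passes any character >= 128 through
        # unchanged; in the other branch the table covers all 256 byte
        # values so escape_nonascii() can octal-escape every byte of the
        # encoded string.)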
        mod = 128
        escape = escape_ascii
    else:
        mod = 256
        escape = escape_nonascii
    escapes = [r"\%03o" % i for i in range(mod)]
    for i in range(32, 127):
        escapes[i] = chr(i)
    escapes[ord('\\')] = r'\\'
    escapes[ord('\t')] = r'\t'
    escapes[ord('\r')] = r'\r'
    escapes[ord('\n')] = r'\n'
    escapes[ord('\"')] = r'\"'


def escape_ascii(s, encoding):
    return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)


def escape_nonascii(s, encoding):
    return ''.join(escapes[b] for b in s.encode(encoding))


def is_literal_string(s):
    return s[0] in '\'"' or (s[0] in 'rRuU' and s[1] in '\'"')


def safe_eval(s):
    # unwrap quotes, safely
    return eval(s, {'__builtins__': {}}, {})


def normalize(s, encoding):
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    lines = s.split('\n')
    if len(lines) == 1:
        s = '"' + escape(s, encoding) + '"'
    else:
        if not lines[-1]:
            del lines[-1]
            lines[-1] = lines[-1] + '\n'
        for i in range(len(lines)):
            lines[i] = escape(lines[i], encoding)
        lineterm = '\\n"\n"'
        s = '""\n"' + lineterm.join(lines) + '"'
    return s


def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    return 1 in [c in str for c in set]


def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            files = glob.glob(name)
            list = []
            for file in files:
                list.extend(getFilesForName(file))
            return list

        # try to find module or package
        try:
            spec = importlib.util.find_spec(name)
            name = spec.origin if spec else None
        except ImportError:
            name = None
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        list = []
        # get extension for python source files
        _py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        for root, dirs, files in os.walk(name):
            # don't recurse into CVS directories
            if 'CVS' in dirs:
                dirs.remove('CVS')
            # add all *.py files to list
            list.extend(
                [os.path.join(root, file) for file in files
                 if os.path.splitext(file)[1] == _py_ext]
                )
        return list
    elif os.path.exists(name):
        # a single file
        return [name]

    return []


class TokenEater:
    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch
##        import token
##        print('ttype:', token.tok_name[ttype], 'tstring:', tstring,
##              file=sys.stderr)
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
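            # The first literal STRING token of a fresh file (skipping the
            # ENCODING token, comments and blank lines) is taken as the
            # module docstring; any other token ends the fresh-module state.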
            if self.__freshmodule:
                if ttype == tokenize.STRING and is_literal_string(tstring):
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                if ttype in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING):
                    return
                self.__freshmodule = 0
            # class or func/method docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen
            return
        if ttype == tokenize.STRING:
            maybe_fstring = ast.parse(tstring, mode='eval').body
            if not isinstance(maybe_fstring, ast.JoinedStr):
                return
            for value in filter(lambda node: isinstance(node, ast.FormattedValue),
                                maybe_fstring.values):
                for call in filter(lambda node: isinstance(node, ast.Call),
                                   ast.walk(value)):
                    func = call.func
                    if isinstance(func, ast.Name):
                        func_name = func.id
                    elif isinstance(func, ast.Attribute):
                        func_name = func.attr
                    else:
                        continue

                    if func_name not in opts.keywords:
                        continue
                    if len(call.args) != 1:
                        print(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected number of'
                            ' positional arguments in gettext call: %(source_segment)s'
                            ) % {
                            'source_segment': ast.get_source_segment(tstring, call) or tstring,
                            'file': self.__curfile,
                            'lineno': lineno
                        }, file=sys.stderr)
                        continue
                    if call.keywords:
                        print(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected keyword arguments'
                            ' in gettext call: %(source_segment)s'
                            ) % {
                            'source_segment': ast.get_source_segment(tstring, call) or tstring,
                            'file': self.__curfile,
                            'lineno': lineno
                        }, file=sys.stderr)
                        continue
                    arg = call.args[0]
                    if not isinstance(arg, ast.Constant):
                        print(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected argument type'
                            ' in gettext call: %(source_segment)s'
                            ) % {
                            'source_segment': ast.get_source_segment(tstring, call) or tstring,
                            'file': self.__curfile,
                            'lineno': lineno
                        }, file=sys.stderr)
                        continue
                    if isinstance(arg.value, str):
                        self.__addentry(arg.value, lineno)

    def __suiteseen(self, ttype, tstring, lineno):
        # skip over any enclosure pairs until we see the colon
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                # we see a colon and we're not in an enclosure: end of def
                self.__state = self.__suitedocstring
            elif tstring in '([{':
                self.__enclosurecount += 1
            elif tstring in ')]}':
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING and is_literal_string(tstring):
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.
If there 435 # were no strings inside _(), then just ignore this entry. 436 if self.__data: 437 self.__addentry(EMPTYSTRING.join(self.__data)) 438 self.__state = self.__waiting 439 elif ttype == tokenize.STRING and is_literal_string(tstring): 440 self.__data.append(safe_eval(tstring)) 441 elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT, 442 token.NEWLINE, tokenize.NL]: 443 # warn if we see anything else than STRING or whitespace 444 print(_( 445 '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' 446 ) % { 447 'token': tstring, 448 'file': self.__curfile, 449 'lineno': self.__lineno 450 }, file=sys.stderr) 451 self.__state = self.__waiting 452 453 def __addentry(self, msg, lineno=None, isdocstring=0): 454 if lineno is None: 455 lineno = self.__lineno 456 if not msg in self.__options.toexclude: 457 entry = (self.__curfile, lineno) 458 self.__messages.setdefault(msg, {})[entry] = isdocstring 459 460 def set_filename(self, filename): 461 self.__curfile = filename 462 self.__freshmodule = 1 463 464 def write(self, fp): 465 options = self.__options 466 timestamp = time.strftime('%Y-%m-%d %H:%M%z') 467 encoding = fp.encoding if fp.encoding else 'UTF-8' 468 print(pot_header % {'time': timestamp, 'version': __version__, 469 'charset': encoding, 470 'encoding': '8bit'}, file=fp) 471 # Sort the entries. First sort each particular entry's keys, then 472 # sort all the entries by their first item. 473 reverse = {} 474 for k, v in self.__messages.items(): 475 keys = sorted(v.keys()) 476 reverse.setdefault(tuple(keys), []).append((k, v)) 477 rkeys = sorted(reverse.keys()) 478 for rkey in rkeys: 479 rentries = reverse[rkey] 480 rentries.sort() 481 for k, v in rentries: 482 # If the entry was gleaned out of a docstring, then add a 483 # comment stating so. This is to aid translators who may wish 484 # to skip translating some unimportant docstrings. 485 isdocstring = any(v.values()) 486 # k is the message string, v is a dictionary-set of (filename, 487 # lineno) tuples. We want to sort the entries in v first by 488 # file name and then by line number. 
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d,
                            file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)


def main():
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             'docstrings', 'no-docstrings',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0  # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            fp = open(arg)
            try:
                while 1:
                    line = fp.readline()
                    if not line:
                        break
                    options.nodocstrings[line[:-1]] = 1
            finally:
                fp.close()

    # calculate escapes
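    # Note the inversion: make_escapes() takes pass_nonascii, so non-ASCII
    # characters are left alone unless -E/--escape was given.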
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                options.toexclude = fp.readlines()
        except IOError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename,
                file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e:
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()


if __name__ == '__main__':
    main()
    # some more test strings
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')
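
# Example invocation (hypothetical project layout; adjust names and paths):
#   pygettext.py -d myapp -p po -k N_ src/*.py
# would write po/myapp.pot, extracting strings wrapped in _() (the default
# keyword) as well as N_().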