#! /usr/bin/env python3
# -*- coding: iso-8859-1 -*-
# Originally written by Barry Warsaw <barry@python.org>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf@artcom-gmbh.de>
#
# 2002-11-22 Jürgen Hermann <jh@web.de>
# Added checks that _() only contains string literals, and
# command line args are resolved to module lists, i.e. you
# can now pass a filename, a module or package name, or a
# directory (including globbing chars, important for Win32).
# Made docstring fit in 80 chars wide displays using pydoc.
#

# for selftesting
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    _ = lambda s: s

__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, GNU) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well.  Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.
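
For example, if your code also marks strings with a hypothetical N_()
marker, an invocation such as

    pygettext.py --keyword=N_ mymodule.py

would pick up those strings as well.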

 [1] https://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] https://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext wherever possible. However, some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings.  These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above).  Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source.  These lines appear before
        each msgid.  The style of comments is controlled by the -S/--style
        option.  This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename.  If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments.  Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive.  GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not to be
        extracted from the input files.  Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted.  This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")

import os
import importlib.machinery
import importlib.util
import sys
import glob
import time
import getopt
import ast
import token
import tokenize

__version__ = '1.5'

default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''



# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
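# The %(time)s, %(charset)s, %(encoding)s and %(version)s placeholders above
# are filled in by TokenEater.write() just before the header is emitted.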


def usage(code, msg=''):
    print(__doc__ % globals(), file=sys.stderr)
    if msg:
        print(msg, file=sys.stderr)
    sys.exit(code)



def make_escapes(pass_nonascii):
    global escapes, escape
    if pass_nonascii:
        # Allow non-ascii characters to pass through so that e.g. 'msgid
        # "Höhe"' would not result in 'msgid "H\366he"'.  Otherwise we
        # escape any character outside the 32..126 range.
        mod = 128
        escape = escape_ascii
    else:
        mod = 256
        escape = escape_nonascii
    escapes = [r"\%03o" % i for i in range(mod)]
    for i in range(32, 127):
        escapes[i] = chr(i)
    escapes[ord('\\')] = r'\\'
    escapes[ord('\t')] = r'\t'
    escapes[ord('\r')] = r'\r'
    escapes[ord('\n')] = r'\n'
    escapes[ord('\"')] = r'\"'


def escape_ascii(s, encoding):
    return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)

def escape_nonascii(s, encoding):
    return ''.join(escapes[b] for b in s.encode(encoding))

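# Illustrative behaviour, assuming make_escapes() has populated the tables:
#   escape_nonascii('Höhe', 'iso-8859-1')  ->  'H\366he'
#   escape_ascii('Höhe', 'utf-8')          ->  'Höhe'  (non-ASCII passes through)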

def is_literal_string(s):
    return s[0] in '\'"' or (s[0] in 'rRuU' and s[1] in '\'"')


def safe_eval(s):
    # unwrap quotes, safely
    return eval(s, {'__builtins__':{}}, {})

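# For example, safe_eval('"foo" "bar"') returns 'foobar'; builtins are
# disabled, so the evaluated literal cannot call anything.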

def normalize(s, encoding):
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    lines = s.split('\n')
    if len(lines) == 1:
        s = '"' + escape(s, encoding) + '"'
    else:
        if not lines[-1]:
            del lines[-1]
            lines[-1] = lines[-1] + '\n'
        for i in range(len(lines)):
            lines[i] = escape(lines[i], encoding)
        lineterm = '\\n"\n"'
        s = '""\n"' + lineterm.join(lines) + '"'
    return s

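# e.g. normalize('Error:\nbad input\n', 'utf-8') yields the multi-line form
#   ""
#   "Error:\n"
#   "bad input\n"
# which is how msgid strings with embedded newlines appear in a .po file.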

def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    return any(c in str for c in set)


def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            files = glob.glob(name)
            list = []
            for file in files:
                list.extend(getFilesForName(file))
            return list

        # try to find module or package
        try:
            spec = importlib.util.find_spec(name)
            # find_spec() returns None if the module cannot be found
            name = spec.origin if spec else None
        except ImportError:
            name = None
        if not name:
            return []
    if os.path.isdir(name):
        # find all python files in directory
        list = []
        # get extension for python source files
        _py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        for root, dirs, files in os.walk(name):
            # don't recurse into CVS directories
            if 'CVS' in dirs:
                dirs.remove('CVS')
            # add all *.py files to list
            list.extend(
                [os.path.join(root, file) for file in files
                 if os.path.splitext(file)[1] == _py_ext]
                )
        return list
    elif os.path.exists(name):
        # a single file
        return [name]

    return []


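# TokenEater is a small state machine fed one token at a time by
# tokenize.tokenize(): __waiting looks for keywords and docstrings,
# __keywordseen/__openseen collect the string literals passed to _() calls,
# and __suiteseen/__suitedocstring pick up class and def docstrings.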
class TokenEater:
    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch
##        import token
##        print('ttype:', token.tok_name[ttype], 'tstring:', tstring,
##              file=sys.stderr)
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING and is_literal_string(tstring):
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                if ttype in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING):
                    return
                self.__freshmodule = 0
            # class or func/method docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen
            return
        if ttype == tokenize.STRING:
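            # f-strings arrive as a single STRING token; parse with ast to find
            # any gettext calls embedded in replacement fields, e.g. a token
            # like f'{_("msg")}' records "msg" below.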
            maybe_fstring = ast.parse(tstring, mode='eval').body
            if not isinstance(maybe_fstring, ast.JoinedStr):
                return
            for value in filter(lambda node: isinstance(node, ast.FormattedValue),
                                maybe_fstring.values):
                for call in filter(lambda node: isinstance(node, ast.Call),
                                   ast.walk(value)):
                    func = call.func
                    if isinstance(func, ast.Name):
                        func_name = func.id
                    elif isinstance(func, ast.Attribute):
                        func_name = func.attr
                    else:
                        continue

                    if func_name not in opts.keywords:
                        continue
                    if len(call.args) != 1:
                        print(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected number of'
                            ' positional arguments in gettext call: %(source_segment)s'
                            ) % {
                            'source_segment': ast.get_source_segment(tstring, call) or tstring,
                            'file': self.__curfile,
                            'lineno': lineno
                            }, file=sys.stderr)
                        continue
                    if call.keywords:
                        print(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected keyword arguments'
                            ' in gettext call: %(source_segment)s'
                            ) % {
                            'source_segment': ast.get_source_segment(tstring, call) or tstring,
                            'file': self.__curfile,
                            'lineno': lineno
                            }, file=sys.stderr)
                        continue
                    arg = call.args[0]
                    if not isinstance(arg, ast.Constant):
                        print(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected argument type'
                            ' in gettext call: %(source_segment)s'
                            ) % {
                            'source_segment': ast.get_source_segment(tstring, call) or tstring,
                            'file': self.__curfile,
                            'lineno': lineno
                            }, file=sys.stderr)
                        continue
                    if isinstance(arg.value, str):
                        self.__addentry(arg.value, lineno)

    def __suiteseen(self, ttype, tstring, lineno):
        # skip over any enclosure pairs until we see the colon
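        # e.g. the ':' inside default values or annotations such as
        # 'def f(d={1: 2}) -> None:' must not end the header; only the
        # outermost colon does.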
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                # we see a colon and we're not in an enclosure: end of def
                self.__state = self.__suitedocstring
            elif tstring in '([{':
                self.__enclosurecount += 1
            elif tstring in ')]}':
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING and is_literal_string(tstring):
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
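        # Collect every string literal between the opening '(' and the closing
        # ')'; adjacent literals such as _("Hello " "world") are joined into a
        # single msgid.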
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING and is_literal_string(tstring):
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything other than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        self.__curfile = filename
        self.__freshmodule = 1

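    # Emit the .pot file: the header first, then one block per message; an
    # entry looks roughly like (illustrative):
    #   #: spam.py:12
    #   msgid "Hello world"
    #   msgstr ""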
    def write(self, fp):
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        encoding = fp.encoding if fp.encoding else 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = sorted(reverse.keys())
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different between Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d, file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)



def main():
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             'docstrings', 'no-docstrings',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            with open(arg) as fp:
                for line in fp:
                    options.nodocstrings[line[:-1]] = 1

    # calculate escapes
    make_escapes(not options.escape)
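    # Note the inversion: pass_nonascii is True by default, so non-ASCII text
    # passes through untouched unless -E/--escape was given.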

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                options.toexclude = fp.readlines()
        except IOError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e:
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()


if __name__ == '__main__':
    main()
    # some more test strings
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')
685