aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Serhiy Storchaka <storchaka@gmail.com> 2016-11-08 21:17:46 +0200
committer Serhiy Storchaka <storchaka@gmail.com> 2016-11-08 21:17:46 +0200
commit 07bcf05fcf3fd1d4001e8e3489162e6d67638285 (patch)
tree ee55a562d4ac5d1ff722e3ac13a750e762ed71a9 /Lib/gettext.py
parent Issue #26171: Prevent buffer overflow in get_data (diff)
download cpython-07bcf05fcf3fd1d4001e8e3489162e6d67638285.tar.gz
cpython-07bcf05fcf3fd1d4001e8e3489162e6d67638285.tar.bz2
cpython-07bcf05fcf3fd1d4001e8e3489162e6d67638285.zip
Issue #28563: Fixed possible DoS and arbitrary code execution when handling
plural form selections in the gettext module. The expression parser now supports the exact syntax supported by GNU gettext.
Diffstat (limited to 'Lib/gettext.py')
-rw-r--r-- Lib/gettext.py 172
1 files changed, 128 insertions, 44 deletions
diff --git a/Lib/gettext.py b/Lib/gettext.py
index e43f044cc71..1dadbc7a64c 100644
--- a/Lib/gettext.py
+++ b/Lib/gettext.py
@@ -57,55 +57,139 @@ __all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
_default_localedir = os.path.join(sys.base_prefix, 'share', 'locale')
+# Expression parsing for plural form selection.
+#
+# The gettext library supports a small subset of C syntax. The only
+# incompatible difference is that integer literals starting with zero are
+# decimal.
+#
+# https://www.gnu.org/software/gettext/manual/gettext.html#Plural-forms
+# http://git.savannah.gnu.org/cgit/gettext.git/tree/gettext-runtime/intl/plural.y
+
+# One named alternative per token kind; the name of the group that matched
+# is what _tokenize() dispatches on.  INVALID is the last alternative, so
+# it only matches text that none of the other alternatives accept.
+_token_pattern = re.compile(r"""
+        (?P<WHITESPACES>[ \t]+)                    | # spaces and horizontal tabs
+        (?P<NUMBER>[0-9]+\b)                       | # decimal integer
+        (?P<NAME>n\b)                              | # only n is allowed
+        (?P<PARENTHESIS>[()])                      |
+        (?P<OPERATOR>[-*/%+?:]|[><!]=?|==|&&|\|\|) | # !, *, /, %, +, -, <, >,
+                                                     # <=, >=, ==, !=, &&, ||,
+                                                     # ? :
+                                                     # unary and bitwise ops
+                                                     # not allowed
+        (?P<INVALID>\w+|.)                           # invalid token
+    """, re.VERBOSE|re.DOTALL)
+
+def _tokenize(plural):
+    """Yield the tokens of the plural-form expression *plural* as strings.
+
+    Whitespace is skipped.  After the last real token an empty string is
+    yielded as an end-of-input marker.  This is a generator, so the
+    ValueError for a token matching the INVALID group of _token_pattern
+    is raised while iterating, not when _tokenize() is called.
+    """
+    for mo in re.finditer(_token_pattern, plural):
+        kind = mo.lastgroup
+        if kind == 'WHITESPACES':
+            continue
+        value = mo.group(kind)
+        if kind == 'INVALID':
+            raise ValueError('invalid token in plural form: %s' % value)
+        yield value
+    yield ''
+
+def _error(value):
+    """Return (not raise) a ValueError describing the unexpected token *value*.
+
+    An empty *value* is reported as an unexpected end of the plural form;
+    the caller is responsible for raising the returned exception.
+    """
+    if value:
+        return ValueError('unexpected token in plural form: %s' % value)
+    else:
+        return ValueError('unexpected end of plural form')
+
+# Binary operators grouped by precedence level, loosest-binding group first.
+_binary_ops = (
+    ('||',),
+    ('&&',),
+    ('==', '!='),
+    ('<', '>', '<=', '>='),
+    ('+', '-'),
+    ('*', '/', '%'),
+)
+# Rebind the name to a mapping from operator to its 1-based precedence level.
+_binary_ops = {op: i for i, ops in enumerate(_binary_ops, 1) for op in ops}
+# C operators whose Python spelling differs; all others are emitted unchanged.
+_c2py_ops = {'||': 'or', '&&': 'and', '/': '//'}
+
+def _parse(tokens, priority=-1):
+    """Translate one C (sub)expression read from *tokens* into Python source.
+
+    *tokens* is an iterator of token strings such as the one produced by
+    _tokenize().  *priority* is the lowest precedence level from
+    _binary_ops that this call may consume; a '?:' conditional is only
+    consumed when priority <= 0.
+
+    Returns a (python_source, next_token) pair, where next_token is the
+    first token that was read but not consumed.  Raises ValueError on an
+    unexpected token or an unbalanced parenthesis.
+    """
+    result = ''
+    nexttok = next(tokens)
+    # Any number of leading '!' are translated to Python 'not'.
+    while nexttok == '!':
+        result += 'not '
+        nexttok = next(tokens)
+
+    # Operand: a parenthesized subexpression, the variable n, or a number.
+    if nexttok == '(':
+        sub, nexttok = _parse(tokens)
+        result = '%s(%s)' % (result, sub)
+        if nexttok != ')':
+            raise ValueError('unbalanced parenthesis in plural form')
+    elif nexttok == 'n':
+        result = '%s%s' % (result, nexttok)
+    else:
+        try:
+            value = int(nexttok, 10)
+        except ValueError:
+            raise _error(nexttok) from None
+        result = '%s%d' % (result, value)
+    nexttok = next(tokens)
+
+    # j is the level of the previously consumed operator; 100 is a
+    # sentinel that matches no real level.
+    j = 100
+    while nexttok in _binary_ops:
+        i = _binary_ops[nexttok]
+        if i < priority:
+            break
+        # Break chained comparisons
+        if i in (3, 4) and j in (3, 4):  # '==', '!=', '<', '>', '<=', '>='
+            result = '(%s)' % result
+        # Replace some C operators by their Python equivalents
+        op = _c2py_ops.get(nexttok, nexttok)
+        right, nexttok = _parse(tokens, i + 1)
+        result = '%s %s %s' % (result, op, right)
+        j = i
+    if j == priority == 4:  # '<', '>', '<=', '>='
+        result = '(%s)' % result
+
+    # C 'cond ? a : b' becomes Python 'a if cond else b'.
+    if nexttok == '?' and priority <= 0:
+        if_true, nexttok = _parse(tokens, 0)
+        if nexttok != ':':
+            raise _error(nexttok)
+        if_false, nexttok = _parse(tokens)
+        result = '%s if %s else %s' % (if_true, result, if_false)
+        if priority == 0:
+            result = '(%s)' % result
+
+    return result, nexttok
def c2py(plural):
     """Gets a C expression as used in PO files for plural forms and returns a
-    Python lambda function that implements an equivalent expression.
+    Python function that implements an equivalent expression.
+
+    Raises ValueError if the expression is longer than 1000 characters,
+    cannot be tokenized or parsed, or is nested too deeply.  The returned
+    function raises ValueError when called with a non-integer argument.
     """
-    # Security check, allow only the "n" identifier
-    import token, tokenize
-    tokens = tokenize.generate_tokens(io.StringIO(plural).readline)
-    try:
-        danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n']
-    except tokenize.TokenError:
-        raise ValueError('plural forms expression error, maybe unbalanced parenthesis')
-    else:
-        if danger:
-            raise ValueError('plural forms expression could be dangerous')
-
-    # Replace some C operators by their Python equivalents
-    plural = plural.replace('&&', ' and ')
-    plural = plural.replace('||', ' or ')
-
-    expr = re.compile(r'\!([^=])')
-    plural = expr.sub(' not \\1', plural)
-
-    # Regular expression and replacement function used to transform
-    # "a?b:c" to "b if a else c".
-    expr = re.compile(r'(.*?)\?(.*?):(.*)')
-    def repl(x):
-        return "(%s if %s else %s)" % (x.group(2), x.group(1),
-                                       expr.sub(repl, x.group(3)))
-
-    # Code to transform the plural expression, taking care of parentheses
-    stack = ['']
-    for c in plural:
-        if c == '(':
-            stack.append('')
-        elif c == ')':
-            if len(stack) == 1:
-                # Actually, we never reach this code, because unbalanced
-                # parentheses get caught in the security check at the
-                # beginning.
-                raise ValueError('unbalanced parenthesis in plural form')
-            s = expr.sub(repl, stack.pop())
-            stack[-1] += '(%s)' % s
-        else:
-            stack[-1] += c
-    plural = expr.sub(repl, stack.pop())
-
-    return eval('lambda n: int(%s)' % plural)
+    if len(plural) > 1000:
+        raise ValueError('plural form expression is too long')
+    try:
+        result, nexttok = _parse(_tokenize(plural))
+        # Anything other than the '' end marker means trailing garbage.
+        if nexttok:
+            raise _error(nexttok)
+
+        depth = 0
+        for c in result:
+            if c == '(':
+                depth += 1
+                if depth > 20:
+                    # Python compiler limit is about 90.
+                    # The most complex example has 2.
+                    raise ValueError('plural form expression is too complex')
+            elif c == ')':
+                depth -= 1
+
+        ns = {}
+        exec('''if True:
+            def func(n):
+                if not isinstance(n, int):
+                    raise ValueError('Plural value must be an integer.')
+                return int(%s)
+            ''' % result, ns)
+        return ns['func']
+    except RuntimeError:
+        # Recursion error can be raised in _parse() or exec().
+        raise ValueError('plural form expression is too complex')
def _expand_lang(loc):