diff options
author | Brett Cannon <brett@python.org> | 2016-09-09 14:57:09 -0700 |
---|---|---|
committer | Brett Cannon <brett@python.org> | 2016-09-09 14:57:09 -0700 |
commit | a721abac299bb6529021000a71847486d531b41a (patch) | |
tree | 8355a69b891cfcdaad8a5fd62870231b7f940696 | |
parent | Merge heads (diff) | |
download | cpython-a721abac299bb6529021000a71847486d531b41a.tar.gz cpython-a721abac299bb6529021000a71847486d531b41a.tar.bz2 cpython-a721abac299bb6529021000a71847486d531b41a.zip |
Issue #26331: Implement the parsing part of PEP 515.
Thanks to Georg Brandl for the patch.
-rw-r--r-- | Doc/library/decimal.rst | 10 | ||||
-rw-r--r-- | Doc/library/functions.rst | 16 | ||||
-rw-r--r-- | Doc/reference/lexical_analysis.rst | 45 | ||||
-rw-r--r-- | Doc/whatsnew/3.6.rst | 23 | ||||
-rw-r--r-- | Include/pystrtod.h | 4 | ||||
-rw-r--r-- | Lib/_pydecimal.py | 10 | ||||
-rw-r--r-- | Lib/test/test_complex.py | 14 | ||||
-rw-r--r-- | Lib/test/test_decimal.py | 10 | ||||
-rw-r--r-- | Lib/test/test_float.py | 24 | ||||
-rw-r--r-- | Lib/test/test_grammar.py | 89 | ||||
-rw-r--r-- | Lib/test/test_int.py | 21 | ||||
-rw-r--r-- | Lib/test/test_tokenize.py | 30 | ||||
-rw-r--r-- | Lib/test/test_types.py | 1 | ||||
-rw-r--r-- | Lib/tokenize.py | 17 | ||||
-rw-r--r-- | Misc/NEWS | 6 | ||||
-rw-r--r-- | Modules/_decimal/_decimal.c | 12 | ||||
-rw-r--r-- | Objects/complexobject.c | 63 | ||||
-rw-r--r-- | Objects/floatobject.c | 59 | ||||
-rw-r--r-- | Objects/longobject.c | 169 | ||||
-rw-r--r-- | Parser/tokenizer.c | 230 | ||||
-rw-r--r-- | Python/ast.c | 27 | ||||
-rw-r--r-- | Python/pystrtod.c | 66 |
22 files changed, 742 insertions, 204 deletions
diff --git a/Doc/library/decimal.rst b/Doc/library/decimal.rst index ee746e933de..e984edcb754 100644 --- a/Doc/library/decimal.rst +++ b/Doc/library/decimal.rst @@ -345,7 +345,7 @@ Decimal objects *value* can be an integer, string, tuple, :class:`float`, or another :class:`Decimal` object. If no *value* is given, returns ``Decimal('0')``. If *value* is a string, it should conform to the decimal numeric string syntax after leading - and trailing whitespace characters are removed:: + and trailing whitespace characters, as well as underscores throughout, are removed:: sign ::= '+' | '-' digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' @@ -394,6 +394,10 @@ Decimal objects :class:`float` arguments raise an exception if the :exc:`FloatOperation` trap is set. By default the trap is off. + .. versionchanged:: 3.6 + Underscores are allowed for grouping, as with integral and floating-point + literals in code. + Decimal floating point objects share many properties with the other built-in numeric types such as :class:`float` and :class:`int`. All of the usual math operations and special methods apply. Likewise, decimal objects can be @@ -1075,8 +1079,8 @@ In addition to the three supplied contexts, new contexts can be created with the Decimal('4.44') This method implements the to-number operation of the IBM specification. - If the argument is a string, no leading or trailing whitespace is - permitted. + If the argument is a string, no leading or trailing whitespace or + underscores are permitted. .. method:: create_decimal_from_float(f) diff --git a/Doc/library/functions.rst b/Doc/library/functions.rst index db04b105c86..c4fcd98252e 100644 --- a/Doc/library/functions.rst +++ b/Doc/library/functions.rst @@ -271,6 +271,9 @@ are always available. They are listed here in alphabetical order. The complex type is described in :ref:`typesnumeric`. + .. versionchanged:: 3.6 + Grouping digits with underscores as in code literals is allowed. + .. 
function:: delattr(object, name) @@ -531,11 +534,14 @@ are always available. They are listed here in alphabetical order. The float type is described in :ref:`typesnumeric`. - .. index:: - single: __format__ - single: string; format() (built-in function) + .. versionchanged:: 3.6 + Grouping digits with underscores as in code literals is allowed. +.. index:: + single: __format__ + single: string; format() (built-in function) + .. function:: format(value[, format_spec]) Convert a *value* to a "formatted" representation, as controlled by @@ -702,6 +708,10 @@ are always available. They are listed here in alphabetical order. :meth:`base.__int__ <object.__int__>` instead of :meth:`base.__index__ <object.__index__>`. + .. versionchanged:: 3.6 + Grouping digits with underscores as in code literals is allowed. + + .. function:: isinstance(object, classinfo) Return true if the *object* argument is an instance of the *classinfo* diff --git a/Doc/reference/lexical_analysis.rst b/Doc/reference/lexical_analysis.rst index 48f20434f01..a7c6a684c04 100644 --- a/Doc/reference/lexical_analysis.rst +++ b/Doc/reference/lexical_analysis.rst @@ -721,20 +721,24 @@ Integer literals Integer literals are described by the following lexical definitions: .. 
productionlist:: - integer: `decimalinteger` | `octinteger` | `hexinteger` | `bininteger` - decimalinteger: `nonzerodigit` `digit`* | "0"+ + integer: `decinteger` | `bininteger` | `octinteger` | `hexinteger` + decinteger: `nonzerodigit` (["_"] `digit`)* | "0"+ (["_"] "0")* + bininteger: "0" ("b" | "B") (["_"] `bindigit`)+ + octinteger: "0" ("o" | "O") (["_"] `octdigit`)+ + hexinteger: "0" ("x" | "X") (["_"] `hexdigit`)+ nonzerodigit: "1"..."9" digit: "0"..."9" - octinteger: "0" ("o" | "O") `octdigit`+ - hexinteger: "0" ("x" | "X") `hexdigit`+ - bininteger: "0" ("b" | "B") `bindigit`+ + bindigit: "0" | "1" octdigit: "0"..."7" hexdigit: `digit` | "a"..."f" | "A"..."F" - bindigit: "0" | "1" There is no limit for the length of integer literals apart from what can be stored in available memory. +Underscores are ignored for determining the numeric value of the literal. They +can be used to group digits for enhanced readability. One underscore can occur +between digits, and after base specifiers like ``0x``. + Note that leading zeros in a non-zero decimal number are not allowed. This is for disambiguation with C-style octal literals, which Python used before version 3.0. @@ -743,6 +747,10 @@ Some examples of integer literals:: 7 2147483647 0o177 0b100110111 3 79228162514264337593543950336 0o377 0xdeadbeef + 100_000_000_000 0b_1110_0101 + +.. versionchanged:: 3.6 + Underscores are now allowed for grouping purposes in literals. .. _floating: @@ -754,23 +762,28 @@ Floating point literals are described by the following lexical definitions: .. productionlist:: floatnumber: `pointfloat` | `exponentfloat` - pointfloat: [`intpart`] `fraction` | `intpart` "." - exponentfloat: (`intpart` | `pointfloat`) `exponent` - intpart: `digit`+ - fraction: "." `digit`+ - exponent: ("e" | "E") ["+" | "-"] `digit`+ + pointfloat: [`digitpart`] `fraction` | `digitpart` "." + exponentfloat: (`digitpart` | `pointfloat`) `exponent` + digitpart: `digit` (["_"] `digit`)* + fraction: "." 
`digitpart` + exponent: ("e" | "E") ["+" | "-"] `digitpart` Note that the integer and exponent parts are always interpreted using radix 10. For example, ``077e010`` is legal, and denotes the same number as ``77e10``. The -allowed range of floating point literals is implementation-dependent. Some -examples of floating point literals:: +allowed range of floating point literals is implementation-dependent. As in +integer literals, underscores are supported for digit grouping. + +Some examples of floating point literals:: - 3.14 10. .001 1e100 3.14e-10 0e0 + 3.14 10. .001 1e100 3.14e-10 0e0 3.14_15_93 Note that numeric literals do not include a sign; a phrase like ``-1`` is actually an expression composed of the unary operator ``-`` and the literal ``1``. +.. versionchanged:: 3.6 + Underscores are now allowed for grouping purposes in literals. + .. _imaginary: @@ -780,7 +793,7 @@ Imaginary literals Imaginary literals are described by the following lexical definitions: .. productionlist:: - imagnumber: (`floatnumber` | `intpart`) ("j" | "J") + imagnumber: (`floatnumber` | `digitpart`) ("j" | "J") An imaginary literal yields a complex number with a real part of 0.0. Complex numbers are represented as a pair of floating point numbers and have the same @@ -788,7 +801,7 @@ restrictions on their range. To create a complex number with a nonzero real part, add a floating point number to it, e.g., ``(3+4j)``. Some examples of imaginary literals:: - 3.14j 10.j 10j .001j 1e100j 3.14e-10j + 3.14j 10.j 10j .001j 1e100j 3.14e-10j 3.14_15_93j .. _operators: diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst index f15bf4d30b2..9a1648643ac 100644 --- a/Doc/whatsnew/3.6.rst +++ b/Doc/whatsnew/3.6.rst @@ -124,6 +124,29 @@ Windows improvements: New Features ============ +.. 
_pep-515: + +PEP 515: Underscores in Numeric Literals +======================================== + +Prior to PEP 515, there was no support for writing long numeric +literals with some form of separator to improve readability. For +instance, how big is ``1000000000000000``? With :pep:`515`, though, +you can use underscores to separate digits as desired to make numeric +literals easier to read: ``1_000_000_000_000_000``. Underscores can be +used with other numeric literals beyond integers, e.g. +``0x_FF_FF_FF_FF``. + +Single underscores are allowed between digits and after any base +specifier. More than a single underscore in a row, leading, or +trailing underscores are not allowed. + +.. seealso:: + + :pep:`515` - Underscores in Numeric Literals + PEP written by Georg Brandl & Serhiy Storchaka. + + .. _pep-523: PEP 523: Adding a frame evaluation API to CPython diff --git a/Include/pystrtod.h b/Include/pystrtod.h index 23fd1c62551..c1e84de6fe5 100644 --- a/Include/pystrtod.h +++ b/Include/pystrtod.h @@ -19,6 +19,10 @@ PyAPI_FUNC(char *) PyOS_double_to_string(double val, int *type); #ifndef Py_LIMITED_API +PyAPI_FUNC(PyObject *) _Py_string_to_number_with_underscores( + const char *str, Py_ssize_t len, const char *what, PyObject *obj, void *arg, + PyObject *(*innerfunc)(const char *, Py_ssize_t, void *)); + PyAPI_FUNC(double) _Py_parse_inf_or_nan(const char *p, char **endptr); #endif diff --git a/Lib/_pydecimal.py b/Lib/_pydecimal.py index 21e875c31c4..6318a49ce70 100644 --- a/Lib/_pydecimal.py +++ b/Lib/_pydecimal.py @@ -589,7 +589,7 @@ class Decimal(object): # From a string # REs insist on real strings, so we can too. if isinstance(value, str): - m = _parser(value.strip()) + m = _parser(value.strip().replace("_", "")) if m is None: if context is None: context = getcontext() @@ -4125,7 +4125,7 @@ class Context(object): This will make it round up for that operation.
""" rounding = self.rounding - self.rounding= type + self.rounding = type return rounding def create_decimal(self, num='0'): @@ -4134,10 +4134,10 @@ class Context(object): This method implements the to-number operation of the IBM Decimal specification.""" - if isinstance(num, str) and num != num.strip(): + if isinstance(num, str) and (num != num.strip() or '_' in num): return self._raise_error(ConversionSyntax, - "no trailing or leading whitespace is " - "permitted.") + "trailing or leading whitespace and " + "underscores are not permitted.") d = Decimal(num, context=self) if d._isnan() and len(d._int) > self.prec - self.clamp: diff --git a/Lib/test/test_complex.py b/Lib/test/test_complex.py index 0ef9a7a1098..6633a7ae54b 100644 --- a/Lib/test/test_complex.py +++ b/Lib/test/test_complex.py @@ -1,5 +1,7 @@ import unittest from test import support +from test.test_grammar import (VALID_UNDERSCORE_LITERALS, + INVALID_UNDERSCORE_LITERALS) from random import random from math import atan2, isnan, copysign @@ -377,6 +379,18 @@ class ComplexTest(unittest.TestCase): self.assertAlmostEqual(complex(complex1(1j)), 2j) self.assertRaises(TypeError, complex, complex2(1j)) + def test_underscores(self): + # check underscores + for lit in VALID_UNDERSCORE_LITERALS: + if not any(ch in lit for ch in 'xXoObB'): + self.assertEqual(complex(lit), eval(lit)) + self.assertEqual(complex(lit), complex(lit.replace('_', ''))) + for lit in INVALID_UNDERSCORE_LITERALS: + if lit in ('0_7', '09_99'): # octals are not recognized here + continue + if not any(ch in lit for ch in 'xXoObB'): + self.assertRaises(ValueError, complex, lit) + def test_hash(self): for x in range(-30, 30): self.assertEqual(hash(x), hash(complex(x, 0))) diff --git a/Lib/test/test_decimal.py b/Lib/test/test_decimal.py index 7492f5466f0..617a37eec82 100644 --- a/Lib/test/test_decimal.py +++ b/Lib/test/test_decimal.py @@ -554,6 +554,10 @@ class ExplicitConstructionTest(unittest.TestCase): self.assertEqual(str(Decimal(' -7.89')), 
'-7.89') self.assertEqual(str(Decimal(" 3.45679 ")), '3.45679') + # underscores + self.assertEqual(str(Decimal('1_3.3e4_0')), '1.33E+41') + self.assertEqual(str(Decimal('1_0_0_0')), '1000') + # unicode whitespace for lead in ["", ' ', '\u00a0', '\u205f']: for trail in ["", ' ', '\u00a0', '\u205f']: @@ -578,6 +582,9 @@ class ExplicitConstructionTest(unittest.TestCase): # embedded NUL self.assertRaises(InvalidOperation, Decimal, "12\u00003") + # underscores don't prevent errors + self.assertRaises(InvalidOperation, Decimal, "1_2_\u00003") + @cpython_only def test_from_legacy_strings(self): import _testcapi @@ -772,6 +779,9 @@ class ExplicitConstructionTest(unittest.TestCase): self.assertRaises(InvalidOperation, nc.create_decimal, "xyz") self.assertRaises(ValueError, nc.create_decimal, (1, "xyz", -25)) self.assertRaises(TypeError, nc.create_decimal, "1234", "5678") + # no whitespace and underscore stripping is done with this method + self.assertRaises(InvalidOperation, nc.create_decimal, " 1234") + self.assertRaises(InvalidOperation, nc.create_decimal, "12_34") # too many NaN payload digits nc.prec = 3 diff --git a/Lib/test/test_float.py b/Lib/test/test_float.py index 68b212e1959..ac8473db503 100644 --- a/Lib/test/test_float.py +++ b/Lib/test/test_float.py @@ -1,4 +1,3 @@ - import fractions import operator import os @@ -9,6 +8,8 @@ import time import unittest from test import support +from test.test_grammar import (VALID_UNDERSCORE_LITERALS, + INVALID_UNDERSCORE_LITERALS) from math import isinf, isnan, copysign, ldexp INF = float("inf") @@ -60,6 +61,27 @@ class GeneralFloatCases(unittest.TestCase): float(b'.' + b'1'*1000) float('.' 
+ '1'*1000) + def test_underscores(self): + for lit in VALID_UNDERSCORE_LITERALS: + if not any(ch in lit for ch in 'jJxXoObB'): + self.assertEqual(float(lit), eval(lit)) + self.assertEqual(float(lit), float(lit.replace('_', ''))) + for lit in INVALID_UNDERSCORE_LITERALS: + if lit in ('0_7', '09_99'): # octals are not recognized here + continue + if not any(ch in lit for ch in 'jJxXoObB'): + self.assertRaises(ValueError, float, lit) + # Additional test cases; nan and inf are never valid as literals, + # only in the float() constructor, but we don't allow underscores + # in or around them. + self.assertRaises(ValueError, float, '_NaN') + self.assertRaises(ValueError, float, 'Na_N') + self.assertRaises(ValueError, float, 'IN_F') + self.assertRaises(ValueError, float, '-_INF') + self.assertRaises(ValueError, float, '-INF_') + # Check that we handle bytes values correctly. + self.assertRaises(ValueError, float, b'0_.\xff9') + def test_non_numeric_input_types(self): # Test possible non-numeric types for the argument x, including # subclasses of the explicitly documented accepted types. diff --git a/Lib/test/test_grammar.py b/Lib/test/test_grammar.py index 109013f5e2f..914aa679441 100644 --- a/Lib/test/test_grammar.py +++ b/Lib/test/test_grammar.py @@ -16,6 +16,87 @@ from collections import ChainMap from test import ann_module2 import test +# These are shared with test_tokenize and other test modules. +# +# Note: since several test cases filter out floats by looking for "e" and ".", +# don't add hexadecimal literals that contain "e" or "E". 
+VALID_UNDERSCORE_LITERALS = [ + '0_0_0', + '4_2', + '1_0000_0000', + '0b1001_0100', + '0xffff_ffff', + '0o5_7_7', + '1_00_00.5', + '1_00_00.5e5', + '1_00_00e5_1', + '1e1_0', + '.1_4', + '.1_4e1', + '0b_0', + '0x_f', + '0o_5', + '1_00_00j', + '1_00_00.5j', + '1_00_00e5_1j', + '.1_4j', + '(1_2.5+3_3j)', + '(.5_6j)', +] +INVALID_UNDERSCORE_LITERALS = [ + # Trailing underscores: + '0_', + '42_', + '1.4j_', + '0x_', + '0b1_', + '0xf_', + '0o5_', + '0 if 1_Else 1', + # Underscores in the base selector: + '0_b0', + '0_xf', + '0_o5', + # Old-style octal, still disallowed: + '0_7', + '09_99', + # Multiple consecutive underscores: + '4_______2', + '0.1__4', + '0.1__4j', + '0b1001__0100', + '0xffff__ffff', + '0x___', + '0o5__77', + '1e1__0', + '1e1__0j', + # Underscore right before a dot: + '1_.4', + '1_.4j', + # Underscore right after a dot: + '1._4', + '1._4j', + '._5', + '._5j', + # Underscore right after a sign: + '1.0e+_1', + '1.0e+_1j', + # Underscore right before j: + '1.4_j', + '1.4e5_j', + # Underscore right before e: + '1_e1', + '1.4_e1', + '1.4_e1j', + # Underscore right after e: + '1e_1', + '1.4e_1', + '1.4e_1j', + # Complex cases with parens: + '(1+1.5_j_)', + '(1+1.5_j)', +] + class TokenTests(unittest.TestCase): @@ -95,6 +176,14 @@ class TokenTests(unittest.TestCase): self.assertEqual(1 if 0else 0, 0) self.assertRaises(SyntaxError, eval, "0 if 1Else 0") + def test_underscore_literals(self): + for lit in VALID_UNDERSCORE_LITERALS: + self.assertEqual(eval(lit), eval(lit.replace('_', ''))) + for lit in INVALID_UNDERSCORE_LITERALS: + self.assertRaises(SyntaxError, eval, lit) + # Sanity check: no literal begins with an underscore + self.assertRaises(NameError, eval, "_0") + def test_string_literals(self): x = ''; y = ""; self.assertTrue(len(x) == 0 and x == y) x = '\''; y = "'"; self.assertTrue(len(x) == 1 and x == y and ord(x) == 39) diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py index 8847f4ce972..14bbd6192a0 100644 --- a/Lib/test/test_int.py +++ 
b/Lib/test/test_int.py @@ -2,6 +2,8 @@ import sys import unittest from test import support +from test.test_grammar import (VALID_UNDERSCORE_LITERALS, + INVALID_UNDERSCORE_LITERALS) L = [ ('0', 0), @@ -212,6 +214,25 @@ class IntTestCases(unittest.TestCase): self.assertEqual(int('2br45qc', 35), 4294967297) self.assertEqual(int('1z141z5', 36), 4294967297) + def test_underscores(self): + for lit in VALID_UNDERSCORE_LITERALS: + if any(ch in lit for ch in '.eEjJ'): + continue + self.assertEqual(int(lit, 0), eval(lit)) + self.assertEqual(int(lit, 0), int(lit.replace('_', ''), 0)) + for lit in INVALID_UNDERSCORE_LITERALS: + if any(ch in lit for ch in '.eEjJ'): + continue + self.assertRaises(ValueError, int, lit, 0) + # Additional test cases with bases != 0, only for the constructor: + self.assertEqual(int("1_00", 3), 9) + self.assertEqual(int("0_100"), 100) # not valid as a literal! + self.assertEqual(int(b"1_00"), 100) # byte underscore + self.assertRaises(ValueError, int, "_100") + self.assertRaises(ValueError, int, "+_100") + self.assertRaises(ValueError, int, "1__00") + self.assertRaises(ValueError, int, "100_") + @support.cpython_only def test_small_ints(self): # Bug #3236: Return small longs from PyLong_FromString diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 4c469a890f8..5a81a5f11a4 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -3,7 +3,9 @@ from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, STRING, ENDMARKER, ENCODING, tok_name, detect_encoding, open as tokenize_open, Untokenizer) from io import BytesIO -from unittest import TestCase, mock, main +from unittest import TestCase, mock +from test.test_grammar import (VALID_UNDERSCORE_LITERALS, + INVALID_UNDERSCORE_LITERALS) import os import token @@ -185,6 +187,21 @@ def k(x): NUMBER '3.14e159' (1, 4) (1, 12) """) + def test_underscore_literals(self): + def number_token(s): + f = BytesIO(s.encode('utf-8')) + for toktype, token, start, end, line 
in tokenize(f.readline): + if toktype == NUMBER: + return token + return 'invalid token' + for lit in VALID_UNDERSCORE_LITERALS: + if '(' in lit: + # this won't work with compound complex inputs + continue + self.assertEqual(number_token(lit), lit) + for lit in INVALID_UNDERSCORE_LITERALS: + self.assertNotEqual(number_token(lit), lit) + def test_string(self): # String literals self.check_tokenize("x = ''; y = \"\"", """\ @@ -1529,11 +1546,10 @@ class TestRoundtrip(TestCase): tempdir = os.path.dirname(fn) or os.curdir testfiles = glob.glob(os.path.join(tempdir, "test*.py")) - # Tokenize is broken on test_unicode_identifiers.py because regular - # expressions are broken on the obscure unicode identifiers in it. - # *sigh* With roundtrip extended to test the 5-tuple mode of - # untokenize, 7 more testfiles fail. Remove them also until the - # failure is diagnosed. + # Tokenize is broken on test_pep3131.py because regular expressions are + # broken on the obscure unicode identifiers in it. *sigh* + # With roundtrip extended to test the 5-tuple mode of untokenize, + # 7 more testfiles fail. Remove them also until the failure is diagnosed. 
testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py")) for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'): @@ -1565,4 +1581,4 @@ class TestRoundtrip(TestCase): if __name__ == "__main__": - main() + unittest.main() diff --git a/Lib/test/test_types.py b/Lib/test/test_types.py index a202196bd2f..382ca03e5ad 100644 --- a/Lib/test/test_types.py +++ b/Lib/test/test_types.py @@ -48,6 +48,7 @@ class TypesTests(unittest.TestCase): def test_float_constructor(self): self.assertRaises(ValueError, float, '') self.assertRaises(ValueError, float, '5\0') + self.assertRaises(ValueError, float, '5_5\0') def test_zero_division(self): try: 5.0 / 0.0 diff --git a/Lib/tokenize.py b/Lib/tokenize.py index ec79ec886da..825aa906460 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -120,16 +120,17 @@ Comment = r'#[^\r\n]*' Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) Name = r'\w+' -Hexnumber = r'0[xX][0-9a-fA-F]+' -Binnumber = r'0[bB][01]+' -Octnumber = r'0[oO][0-7]+' -Decnumber = r'(?:0+|[1-9][0-9]*)' +Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+' +Binnumber = r'0[bB](?:_?[01])+' +Octnumber = r'0[oO](?:_?[0-7])+' +Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)' Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) -Exponent = r'[eE][-+]?[0-9]+' -Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent) -Expfloat = r'[0-9]+' + Exponent +Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*' +Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?', + r'\.[0-9](?:_?[0-9])*') + maybe(Exponent) +Expfloat = r'[0-9](?:_?[0-9])*' + Exponent Floatnumber = group(Pointfloat, Expfloat) -Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]') +Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]') Number = group(Imagnumber, Floatnumber, Intnumber) # Return the empty string, plus all of the valid string prefixes. 
diff --git a/Misc/NEWS b/Misc/NEWS index c47c4bf5b0a..dbd43a319a3 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -17,6 +17,8 @@ Core and Builtins efficient bytecode. Patch by Demur Rumed, design by Serhiy Storchaka, reviewed by Serhiy Storchaka and Victor Stinner. +- Issue #26331: Implement tokenizing support for PEP 515. Patch by Georg Brandl. + - Issue #27999: Make "global after use" a SyntaxError, and ditto for nonlocal. Patch by Ivan Levkivskyi. @@ -2678,7 +2680,7 @@ Library - Issue #24774: Fix docstring in http.server.test. Patch from Chiu-Hsiang Hsu. - Issue #21159: Improve message in configparser.InterpolationMissingOptionError. - Patch from Å?ukasz Langa. + Patch from Łukasz Langa. - Issue #20362: Honour TestCase.longMessage correctly in assertRegex. Patch from Ilia Kurenkov. @@ -4606,7 +4608,7 @@ Library Based on patch by Martin Panter. - Issue #17293: uuid.getnode() now determines MAC address on AIX using netstat. - Based on patch by Aivars KalvÄ?ns. + Based on patch by Aivars Kalvāns. - Issue #22769: Fixed ttk.Treeview.tag_has() when called without arguments. diff --git a/Modules/_decimal/_decimal.c b/Modules/_decimal/_decimal.c index 3ba8e35ce95..fcc1f151cf5 100644 --- a/Modules/_decimal/_decimal.c +++ b/Modules/_decimal/_decimal.c @@ -1889,12 +1889,13 @@ is_space(enum PyUnicode_Kind kind, void *data, Py_ssize_t pos) /* Return the ASCII representation of a numeric Unicode string. The numeric string may contain ascii characters in the range [1, 127], any Unicode space and any unicode digit. If strip_ws is true, leading and trailing - whitespace is stripped. + whitespace is stripped. If ignore_underscores is true, underscores are + ignored. Return NULL if malloc fails and an empty string if invalid characters are found.
*/ static char * -numeric_as_ascii(const PyObject *u, int strip_ws) +numeric_as_ascii(const PyObject *u, int strip_ws, int ignore_underscores) { enum PyUnicode_Kind kind; void *data; @@ -1929,6 +1930,9 @@ numeric_as_ascii(const PyObject *u, int strip_ws) for (; j < len; j++) { ch = PyUnicode_READ(kind, data, j); + if (ignore_underscores && ch == '_') { + continue; + } if (0 < ch && ch <= 127) { *cp++ = ch; continue; @@ -2011,7 +2015,7 @@ PyDecType_FromUnicode(PyTypeObject *type, const PyObject *u, PyObject *dec; char *s; - s = numeric_as_ascii(u, 0); + s = numeric_as_ascii(u, 0, 0); if (s == NULL) { return NULL; } @@ -2031,7 +2035,7 @@ PyDecType_FromUnicodeExactWS(PyTypeObject *type, const PyObject *u, PyObject *dec; char *s; - s = numeric_as_ascii(u, 1); + s = numeric_as_ascii(u, 1, 1); if (s == NULL) { return NULL; } diff --git a/Objects/complexobject.c b/Objects/complexobject.c index a5bfb667c46..a9d5ec301ae 100644 --- a/Objects/complexobject.c +++ b/Objects/complexobject.c @@ -759,29 +759,12 @@ static PyMemberDef complex_members[] = { }; static PyObject * -complex_subtype_from_string(PyTypeObject *type, PyObject *v) +complex_from_string_inner(const char *s, Py_ssize_t len, void *type) { - const char *s, *start; - char *end; double x=0.0, y=0.0, z; int got_bracket=0; - PyObject *s_buffer = NULL; - Py_ssize_t len; - - if (PyUnicode_Check(v)) { - s_buffer = _PyUnicode_TransformDecimalAndSpaceToASCII(v); - if (s_buffer == NULL) - return NULL; - s = PyUnicode_AsUTF8AndSize(s_buffer, &len); - if (s == NULL) - goto error; - } - else { - PyErr_Format(PyExc_TypeError, - "complex() argument must be a string or a number, not '%.200s'", - Py_TYPE(v)->tp_name); - return NULL; - } + const char *start; + char *end; /* position on first nonblank */ start = s; @@ -822,7 +805,7 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v) if (PyErr_ExceptionMatches(PyExc_ValueError)) PyErr_Clear(); else - goto error; + return NULL; } if (end != s) { /* all 4 forms starting 
with <float> land here */ @@ -835,7 +818,7 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v) if (PyErr_ExceptionMatches(PyExc_ValueError)) PyErr_Clear(); else - goto error; + return NULL; } if (end != s) /* <float><signed-float>j */ @@ -890,18 +873,46 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v) if (s-start != len) goto parse_error; - Py_XDECREF(s_buffer); - return complex_subtype_from_doubles(type, x, y); + return complex_subtype_from_doubles((PyTypeObject *)type, x, y); parse_error: PyErr_SetString(PyExc_ValueError, "complex() arg is a malformed string"); - error: - Py_XDECREF(s_buffer); return NULL; } static PyObject * +complex_subtype_from_string(PyTypeObject *type, PyObject *v) +{ + const char *s; + PyObject *s_buffer = NULL, *result = NULL; + Py_ssize_t len; + + if (PyUnicode_Check(v)) { + s_buffer = _PyUnicode_TransformDecimalAndSpaceToASCII(v); + if (s_buffer == NULL) { + return NULL; + } + s = PyUnicode_AsUTF8AndSize(s_buffer, &len); + if (s == NULL) { + goto exit; + } + } + else { + PyErr_Format(PyExc_TypeError, + "complex() argument must be a string or a number, not '%.200s'", + Py_TYPE(v)->tp_name); + return NULL; + } + + result = _Py_string_to_number_with_underscores(s, len, "complex", v, type, + complex_from_string_inner); + exit: + Py_DECREF(s_buffer); + return result; +} + +static PyObject * complex_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { PyObject *r, *i, *tmp; diff --git a/Objects/floatobject.c b/Objects/floatobject.c index 0642b16ba1b..0f37618215c 100644 --- a/Objects/floatobject.c +++ b/Objects/floatobject.c @@ -124,11 +124,43 @@ PyFloat_FromDouble(double fval) return (PyObject *) op; } +static PyObject * +float_from_string_inner(const char *s, Py_ssize_t len, void *obj) +{ + double x; + const char *end; + const char *last = s + len; + /* strip space */ + while (s < last && Py_ISSPACE(*s)) { + s++; + } + + while (s < last - 1 && Py_ISSPACE(last[-1])) { + last--; + } + + /* We don't care about 
overflow or underflow. If the platform + * supports them, infinities and signed zeroes (on underflow) are + * fine. */ + x = PyOS_string_to_double(s, (char **)&end, NULL); + if (end != last) { + PyErr_Format(PyExc_ValueError, + "could not convert string to float: " + "%R", obj); + return NULL; + } + else if (x == -1.0 && PyErr_Occurred()) { + return NULL; + } + else { + return PyFloat_FromDouble(x); + } +} + PyObject * PyFloat_FromString(PyObject *v) { - const char *s, *last, *end; - double x; + const char *s; PyObject *s_buffer = NULL; Py_ssize_t len; Py_buffer view = {NULL, NULL}; @@ -169,27 +201,8 @@ PyFloat_FromString(PyObject *v) Py_TYPE(v)->tp_name); return NULL; } - last = s + len; - /* strip space */ - while (s < last && Py_ISSPACE(*s)) - s++; - while (s < last - 1 && Py_ISSPACE(last[-1])) - last--; - /* We don't care about overflow or underflow. If the platform - * supports them, infinities and signed zeroes (on underflow) are - * fine. */ - x = PyOS_string_to_double(s, (char **)&end, NULL); - if (end != last) { - PyErr_Format(PyExc_ValueError, - "could not convert string to float: " - "%R", v); - result = NULL; - } - else if (x == -1.0 && PyErr_Occurred()) - result = NULL; - else - result = PyFloat_FromDouble(x); - + result = _Py_string_to_number_with_underscores(s, len, "float", v, v, + float_from_string_inner); PyBuffer_Release(&view); Py_XDECREF(s_buffer); return result; diff --git a/Objects/longobject.c b/Objects/longobject.c index 740b7f58861..bbf7e7183eb 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -2004,12 +2004,18 @@ unsigned char _PyLong_DigitValue[256] = { * non-digit (which may be *str!). A normalized int is returned. * The point to this routine is that it takes time linear in the number of * string characters. 
+ * + * Return values: + * -1 on syntax error (exception needs to be set, *res is untouched) + * 0 else (exception may be set, in that case *res is set to NULL) */ -static PyLongObject * -long_from_binary_base(const char **str, int base) +static int +long_from_binary_base(const char **str, int base, PyLongObject **res) { const char *p = *str; const char *start = p; + char prev = 0; + int digits = 0; int bits_per_char; Py_ssize_t n; PyLongObject *z; @@ -2019,23 +2025,43 @@ long_from_binary_base(const char **str, int base) assert(base >= 2 && base <= 32 && (base & (base - 1)) == 0); n = base; - for (bits_per_char = -1; n; ++bits_per_char) + for (bits_per_char = -1; n; ++bits_per_char) { n >>= 1; - /* n <- total # of bits needed, while setting p to end-of-string */ - while (_PyLong_DigitValue[Py_CHARMASK(*p)] < base) + } + /* count digits and set p to end-of-string */ + while (_PyLong_DigitValue[Py_CHARMASK(*p)] < base || *p == '_') { + if (*p == '_') { + if (prev == '_') { + *str = p - 1; + return -1; + } + } else { + ++digits; + } + prev = *p; ++p; + } + if (prev == '_') { + /* Trailing underscore not allowed. */ + *str = p - 1; + return -1; + } + *str = p; /* n <- # of Python digits needed, = ceiling(n/PyLong_SHIFT). */ - n = (p - start) * bits_per_char + PyLong_SHIFT - 1; + n = digits * bits_per_char + PyLong_SHIFT - 1; if (n / bits_per_char < p - start) { PyErr_SetString(PyExc_ValueError, "int string too large to convert"); - return NULL; + *res = NULL; + return 0; } n = n / PyLong_SHIFT; z = _PyLong_New(n); - if (z == NULL) - return NULL; + if (z == NULL) { + *res = NULL; + return 0; + } /* Read string from right, and fill in int from left; i.e., * from least to most significant in both. 
*/ @@ -2043,7 +2069,11 @@ long_from_binary_base(const char **str, int base) bits_in_accum = 0; pdigit = z->ob_digit; while (--p >= start) { - int k = (int)_PyLong_DigitValue[Py_CHARMASK(*p)]; + int k; + if (*p == '_') { + continue; + } + k = (int)_PyLong_DigitValue[Py_CHARMASK(*p)]; assert(k >= 0 && k < base); accum |= (twodigits)k << bits_in_accum; bits_in_accum += bits_per_char; @@ -2062,7 +2092,8 @@ long_from_binary_base(const char **str, int base) } while (pdigit - z->ob_digit < n) *pdigit++ = 0; - return long_normalize(z); + *res = long_normalize(z); + return 0; } /* Parses an int from a bytestring. Leading and trailing whitespace will be @@ -2087,23 +2118,29 @@ PyLong_FromString(const char *str, char **pend, int base) "int() arg 2 must be >= 2 and <= 36"); return NULL; } - while (*str != '\0' && Py_ISSPACE(Py_CHARMASK(*str))) + while (*str != '\0' && Py_ISSPACE(Py_CHARMASK(*str))) { str++; - if (*str == '+') + } + if (*str == '+') { ++str; + } else if (*str == '-') { ++str; sign = -1; } if (base == 0) { - if (str[0] != '0') + if (str[0] != '0') { base = 10; - else if (str[1] == 'x' || str[1] == 'X') + } + else if (str[1] == 'x' || str[1] == 'X') { base = 16; - else if (str[1] == 'o' || str[1] == 'O') + } + else if (str[1] == 'o' || str[1] == 'O') { base = 8; - else if (str[1] == 'b' || str[1] == 'B') + } + else if (str[1] == 'b' || str[1] == 'B') { base = 2; + } else { /* "old" (C-style) octal literal, now invalid. it might still be zero though */ @@ -2114,12 +2151,26 @@ PyLong_FromString(const char *str, char **pend, int base) if (str[0] == '0' && ((base == 16 && (str[1] == 'x' || str[1] == 'X')) || (base == 8 && (str[1] == 'o' || str[1] == 'O')) || - (base == 2 && (str[1] == 'b' || str[1] == 'B')))) + (base == 2 && (str[1] == 'b' || str[1] == 'B')))) { str += 2; + /* One underscore allowed here. */ + if (*str == '_') { + ++str; + } + } + if (str[0] == '_') { + /* May not start with underscores. 
*/ + goto onError; + } start = str; - if ((base & (base - 1)) == 0) - z = long_from_binary_base(&str, base); + if ((base & (base - 1)) == 0) { + int res = long_from_binary_base(&str, base, &z); + if (res < 0) { + /* Syntax error. */ + goto onError; + } + } else { /*** Binary bases can be converted in time linear in the number of digits, because @@ -2208,11 +2259,13 @@ digit beyond the first. ***/ twodigits c; /* current input character */ Py_ssize_t size_z; + int digits = 0; int i; int convwidth; twodigits convmultmax, convmult; digit *pz, *pzstop; - const char* scan; + const char *scan, *lastdigit; + char prev = 0; static double log_base_BASE[37] = {0.0e0,}; static int convwidth_base[37] = {0,}; @@ -2226,8 +2279,9 @@ digit beyond the first. log((double)PyLong_BASE)); for (;;) { twodigits next = convmax * base; - if (next > PyLong_BASE) + if (next > PyLong_BASE) { break; + } convmax = next; ++i; } @@ -2238,21 +2292,43 @@ digit beyond the first. /* Find length of the string of numeric characters. */ scan = str; - while (_PyLong_DigitValue[Py_CHARMASK(*scan)] < base) + lastdigit = str; + + while (_PyLong_DigitValue[Py_CHARMASK(*scan)] < base || *scan == '_') { + if (*scan == '_') { + if (prev == '_') { + /* Only one underscore allowed. */ + str = lastdigit + 1; + goto onError; + } + } + else { + ++digits; + lastdigit = scan; + } + prev = *scan; ++scan; + } + if (prev == '_') { + /* Trailing underscore not allowed. */ + /* Set error pointer to first underscore. */ + str = lastdigit + 1; + goto onError; + } /* Create an int object that can contain the largest possible * integer with this base and length. Note that there's no * need to initialize z->ob_digit -- no slot is read up before * being stored into. 
*/ - size_z = (Py_ssize_t)((scan - str) * log_base_BASE[base]) + 1; + size_z = (Py_ssize_t)(digits * log_base_BASE[base]) + 1; /* Uncomment next line to test exceedingly rare copy code */ /* size_z = 1; */ assert(size_z > 0); z = _PyLong_New(size_z); - if (z == NULL) + if (z == NULL) { return NULL; + } Py_SIZE(z) = 0; /* `convwidth` consecutive input digits are treated as a single @@ -2263,9 +2339,17 @@ digit beyond the first. /* Work ;-) */ while (str < scan) { + if (*str == '_') { + str++; + continue; + } /* grab up to convwidth digits from the input string */ c = (digit)_PyLong_DigitValue[Py_CHARMASK(*str++)]; - for (i = 1; i < convwidth && str != scan; ++i, ++str) { + for (i = 1; i < convwidth && str != scan; ++str) { + if (*str == '_') { + continue; + } + i++; c = (twodigits)(c * base + (int)_PyLong_DigitValue[Py_CHARMASK(*str)]); assert(c < PyLong_BASE); @@ -2277,8 +2361,9 @@ digit beyond the first. */ if (i != convwidth) { convmult = base; - for ( ; i > 1; --i) + for ( ; i > 1; --i) { convmult *= base; + } } /* Multiply z by convmult, and add c. */ @@ -2316,41 +2401,51 @@ digit beyond the first. 
} } } - if (z == NULL) + if (z == NULL) { return NULL; + } if (error_if_nonzero) { /* reset the base to 0, else the exception message doesn't make too much sense */ base = 0; - if (Py_SIZE(z) != 0) + if (Py_SIZE(z) != 0) { goto onError; + } /* there might still be other problems, therefore base remains zero here for the same reason */ } - if (str == start) + if (str == start) { goto onError; - if (sign < 0) + } + if (sign < 0) { Py_SIZE(z) = -(Py_SIZE(z)); - while (*str && Py_ISSPACE(Py_CHARMASK(*str))) + } + while (*str && Py_ISSPACE(Py_CHARMASK(*str))) { str++; - if (*str != '\0') + } + if (*str != '\0') { goto onError; + } long_normalize(z); z = maybe_small_long(z); - if (z == NULL) + if (z == NULL) { return NULL; - if (pend != NULL) + } + if (pend != NULL) { *pend = (char *)str; + } return (PyObject *) z; onError: - if (pend != NULL) + if (pend != NULL) { *pend = (char *)str; + } Py_XDECREF(z); slen = strlen(orig_str) < 200 ? strlen(orig_str) : 200; strobj = PyUnicode_FromStringAndSize(orig_str, slen); - if (strobj == NULL) + if (strobj == NULL) { return NULL; + } PyErr_Format(PyExc_ValueError, "invalid literal for int() with base %d: %.200R", base, strobj); diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index d1e5d352692..a29ba472aa3 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1333,6 +1333,28 @@ verify_identifier(struct tok_state *tok) } #endif +static int +tok_decimal_tail(struct tok_state *tok) +{ + int c; + + while (1) { + do { + c = tok_nextc(tok); + } while (isdigit(c)); + if (c != '_') { + break; + } + c = tok_nextc(tok); + if (!isdigit(c)) { + tok->done = E_TOKEN; + tok_backup(tok, c); + return 0; + } + } + return c; +} + /* Get next token, after space stripping etc. 
*/ static int @@ -1353,17 +1375,20 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) tok->atbol = 0; for (;;) { c = tok_nextc(tok); - if (c == ' ') + if (c == ' ') { col++, altcol++; + } else if (c == '\t') { col = (col/tok->tabsize + 1) * tok->tabsize; altcol = (altcol/tok->alttabsize + 1) * tok->alttabsize; } - else if (c == '\014') /* Control-L (formfeed) */ + else if (c == '\014') {/* Control-L (formfeed) */ col = altcol = 0; /* For Emacs users */ - else + } + else { break; + } } tok_backup(tok, c); if (c == '#' || c == '\n') { @@ -1372,10 +1397,12 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) not passed to the parser as NEWLINE tokens, except *totally* empty lines in interactive mode, which signal the end of a command group. */ - if (col == 0 && c == '\n' && tok->prompt != NULL) + if (col == 0 && c == '\n' && tok->prompt != NULL) { blankline = 0; /* Let it through */ - else + } + else { blankline = 1; /* Ignore completely */ + } /* We can't jump back right here since we still may need to skip to the end of a comment */ } @@ -1383,8 +1410,9 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) if (col == tok->indstack[tok->indent]) { /* No change */ if (altcol != tok->altindstack[tok->indent]) { - if (indenterror(tok)) + if (indenterror(tok)) { return ERRORTOKEN; + } } } else if (col > tok->indstack[tok->indent]) { @@ -1395,8 +1423,9 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) return ERRORTOKEN; } if (altcol <= tok->altindstack[tok->indent]) { - if (indenterror(tok)) + if (indenterror(tok)) { return ERRORTOKEN; + } } tok->pendin++; tok->indstack[++tok->indent] = col; @@ -1415,8 +1444,9 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) return ERRORTOKEN; } if (altcol != tok->altindstack[tok->indent]) { - if (indenterror(tok)) + if (indenterror(tok)) { return ERRORTOKEN; + } } } } @@ -1462,9 +1492,11 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) tok->start = tok->cur - 
1; /* Skip comment */ - if (c == '#') - while (c != EOF && c != '\n') + if (c == '#') { + while (c != EOF && c != '\n') { c = tok_nextc(tok); + } + } /* Check for EOF and errors now */ if (c == EOF) { @@ -1481,27 +1513,35 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) saw_b = 1; /* Since this is a backwards compatibility support literal we don't want to support it in arbitrary order like byte literals. */ - else if (!(saw_b || saw_u || saw_r || saw_f) && (c == 'u' || c == 'U')) + else if (!(saw_b || saw_u || saw_r || saw_f) + && (c == 'u'|| c == 'U')) { saw_u = 1; + } /* ur"" and ru"" are not supported */ - else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) + else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) { saw_r = 1; - else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) + } + else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) { saw_f = 1; - else + } + else { break; + } c = tok_nextc(tok); - if (c == '"' || c == '\'') + if (c == '"' || c == '\'') { goto letter_quote; + } } while (is_potential_identifier_char(c)) { - if (c >= 128) + if (c >= 128) { nonascii = 1; + } c = tok_nextc(tok); } tok_backup(tok, c); - if (nonascii && !verify_identifier(tok)) + if (nonascii && !verify_identifier(tok)) { return ERRORTOKEN; + } *p_start = tok->start; *p_end = tok->cur; @@ -1510,10 +1550,12 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) /* Current token length is 5. */ if (tok->async_def) { /* We're inside an 'async def' function. */ - if (memcmp(tok->start, "async", 5) == 0) + if (memcmp(tok->start, "async", 5) == 0) { return ASYNC; - if (memcmp(tok->start, "await", 5) == 0) + } + if (memcmp(tok->start, "await", 5) == 0) { return AWAIT; + } } else if (memcmp(tok->start, "async", 5) == 0) { /* The current token is 'async'. 
@@ -1546,8 +1588,9 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) /* Newline */ if (c == '\n') { tok->atbol = 1; - if (blankline || tok->level > 0) + if (blankline || tok->level > 0) { goto nextline; + } *p_start = tok->start; *p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; @@ -1570,11 +1613,13 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) *p_start = tok->start; *p_end = tok->cur; return ELLIPSIS; - } else { + } + else { tok_backup(tok, c); } tok_backup(tok, '.'); - } else { + } + else { tok_backup(tok, c); } *p_start = tok->start; @@ -1588,59 +1633,93 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) /* Hex, octal or binary -- maybe. */ c = tok_nextc(tok); if (c == 'x' || c == 'X') { - /* Hex */ c = tok_nextc(tok); - if (!isxdigit(c)) { - tok->done = E_TOKEN; - tok_backup(tok, c); - return ERRORTOKEN; - } do { - c = tok_nextc(tok); - } while (isxdigit(c)); + if (c == '_') { + c = tok_nextc(tok); + } + if (!isxdigit(c)) { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; + } + do { + c = tok_nextc(tok); + } while (isxdigit(c)); + } while (c == '_'); } else if (c == 'o' || c == 'O') { /* Octal */ c = tok_nextc(tok); - if (c < '0' || c >= '8') { - tok->done = E_TOKEN; - tok_backup(tok, c); - return ERRORTOKEN; - } do { - c = tok_nextc(tok); - } while ('0' <= c && c < '8'); + if (c == '_') { + c = tok_nextc(tok); + } + if (c < '0' || c >= '8') { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; + } + do { + c = tok_nextc(tok); + } while ('0' <= c && c < '8'); + } while (c == '_'); } else if (c == 'b' || c == 'B') { /* Binary */ c = tok_nextc(tok); - if (c != '0' && c != '1') { - tok->done = E_TOKEN; - tok_backup(tok, c); - return ERRORTOKEN; - } do { - c = tok_nextc(tok); - } while (c == '0' || c == '1'); + if (c == '_') { + c = tok_nextc(tok); + } + if (c != '0' && c != '1') { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; + } + do { + c = 
tok_nextc(tok); + } while (c == '0' || c == '1'); + } while (c == '_'); } else { int nonzero = 0; /* maybe old-style octal; c is first char of it */ /* in any case, allow '0' as a literal */ - while (c == '0') + while (1) { + if (c == '_') { + c = tok_nextc(tok); + if (!isdigit(c)) { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; + } + } + if (c != '0') { + break; + } c = tok_nextc(tok); - while (isdigit(c)) { + } + if (isdigit(c)) { nonzero = 1; - c = tok_nextc(tok); + c = tok_decimal_tail(tok); + if (c == 0) { + return ERRORTOKEN; + } } - if (c == '.') + if (c == '.') { + c = tok_nextc(tok); goto fraction; - else if (c == 'e' || c == 'E') + } + else if (c == 'e' || c == 'E') { goto exponent; - else if (c == 'j' || c == 'J') + } + else if (c == 'j' || c == 'J') { goto imaginary; + } else if (nonzero) { + /* Old-style octal: now disallowed. */ tok->done = E_TOKEN; tok_backup(tok, c); return ERRORTOKEN; @@ -1649,17 +1728,22 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) } else { /* Decimal */ - do { - c = tok_nextc(tok); - } while (isdigit(c)); + c = tok_decimal_tail(tok); + if (c == 0) { + return ERRORTOKEN; + } { /* Accept floating point numbers. 
*/ if (c == '.') { + c = tok_nextc(tok); fraction: /* Fraction */ - do { - c = tok_nextc(tok); - } while (isdigit(c)); + if (isdigit(c)) { + c = tok_decimal_tail(tok); + if (c == 0) { + return ERRORTOKEN; + } + } } if (c == 'e' || c == 'E') { int e; @@ -1681,14 +1765,16 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) *p_end = tok->cur; return NUMBER; } - do { - c = tok_nextc(tok); - } while (isdigit(c)); + c = tok_decimal_tail(tok); + if (c == 0) { + return ERRORTOKEN; + } } - if (c == 'j' || c == 'J') + if (c == 'j' || c == 'J') { /* Imaginary part */ imaginary: c = tok_nextc(tok); + } } } tok_backup(tok, c); @@ -1708,22 +1794,27 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) c = tok_nextc(tok); if (c == quote) { c = tok_nextc(tok); - if (c == quote) + if (c == quote) { quote_size = 3; - else + } + else { end_quote_size = 1; /* empty string found */ + } } - if (c != quote) + if (c != quote) { tok_backup(tok, c); + } /* Get rest of string */ while (end_quote_size != quote_size) { c = tok_nextc(tok); if (c == EOF) { - if (quote_size == 3) + if (quote_size == 3) { tok->done = E_EOFS; - else + } + else { tok->done = E_EOLS; + } tok->cur = tok->inp; return ERRORTOKEN; } @@ -1732,12 +1823,14 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) tok->cur = tok->inp; return ERRORTOKEN; } - if (c == quote) + if (c == quote) { end_quote_size += 1; + } else { end_quote_size = 0; - if (c == '\\') + if (c == '\\') { tok_nextc(tok); /* skip escaped char */ + } } } @@ -1767,7 +1860,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) int token3 = PyToken_ThreeChars(c, c2, c3); if (token3 != OP) { token = token3; - } else { + } + else { tok_backup(tok, c3); } *p_start = tok->start; diff --git a/Python/ast.c b/Python/ast.c index 37193329c86..dcaa697a38d 100644 --- a/Python/ast.c +++ b/Python/ast.c @@ -4018,7 +4018,7 @@ ast_for_stmt(struct compiling *c, const node *n) } static PyObject * -parsenumber(struct compiling *c, const 
char *s) +parsenumber_raw(struct compiling *c, const char *s) { const char *end; long x; @@ -4061,6 +4061,31 @@ parsenumber(struct compiling *c, const char *s) } static PyObject * +parsenumber(struct compiling *c, const char *s) +{ + char *dup, *end; + PyObject *res = NULL; + + assert(s != NULL); + + if (strchr(s, '_') == NULL) { + return parsenumber_raw(c, s); + } + /* Create a duplicate without underscores. */ + dup = PyMem_Malloc(strlen(s) + 1); + end = dup; + for (; *s; s++) { + if (*s != '_') { + *end++ = *s; + } + } + *end = '\0'; + res = parsenumber_raw(c, dup); + PyMem_Free(dup); + return res; +} + +static PyObject * decode_utf8(struct compiling *c, const char **sPtr, const char *end) { const char *s, *t; diff --git a/Python/pystrtod.c b/Python/pystrtod.c index 5f3af92dca6..64d0c52e487 100644 --- a/Python/pystrtod.c +++ b/Python/pystrtod.c @@ -370,6 +370,72 @@ PyOS_string_to_double(const char *s, return result; } +/* Remove underscores that follow the underscore placement rule from + the string and then call the `innerfunc` function on the result. + It should return a new object or NULL on exception. + + `what` is used for the error message emitted when underscores are detected + that don't follow the rule. `arg` is an opaque pointer passed to the inner + function. + + This is used to implement underscore-agnostic conversion for floats + and complex numbers. +*/ +PyObject * +_Py_string_to_number_with_underscores( + const char *s, Py_ssize_t orig_len, const char *what, PyObject *obj, void *arg, + PyObject *(*innerfunc)(const char *, Py_ssize_t, void *)) +{ + char prev; + const char *p, *last; + char *dup, *end; + PyObject *result; + + if (strchr(s, '_') == NULL) { + return innerfunc(s, orig_len, arg); + } + + dup = PyMem_Malloc(orig_len + 1); + end = dup; + prev = '\0'; + last = s + orig_len; + for (p = s; *p; p++) { + if (*p == '_') { + /* Underscores are only allowed after digits. 
*/ + if (!(prev >= '0' && prev <= '9')) { + goto error; + } + } + else { + *end++ = *p; + /* Underscores are only allowed before digits. */ + if (prev == '_' && !(*p >= '0' && *p <= '9')) { + goto error; + } + } + prev = *p; + } + /* Underscores are not allowed at the end. */ + if (prev == '_') { + goto error; + } + /* No embedded NULs allowed. */ + if (p != last) { + goto error; + } + *end = '\0'; + result = innerfunc(dup, end - dup, arg); + PyMem_Free(dup); + return result; + + error: + PyMem_Free(dup); + PyErr_Format(PyExc_ValueError, + "could not convert string to %s: " + "%R", what, obj); + return NULL; +} + #ifdef PY_NO_SHORT_FLOAT_REPR /* Given a string that may have a decimal point in the current |