Issue #26331: Implement the parsing part of PEP 515.

Thanks to Georg Brandl for the patch.
author: Brett Cannon <brett@python.org> 2016-09-09 14:57:09 -0700
committer: Brett Cannon <brett@python.org> 2016-09-09 14:57:09 -0700
commit: a721abac299bb6529021000a71847486d531b41a (patch)
tree: 8355a69b891cfcdaad8a5fd62870231b7f940696 /Lib
parent: Merge heads (diff)
download: cpython-a721abac299bb6529021000a71847486d531b41a.tar.gz
cpython-a721abac299bb6529021000a71847486d531b41a.tar.bz2
cpython-a721abac299bb6529021000a71847486d531b41a.zip
9 files changed, 195 insertions, 21 deletions
diff --git a/Lib/_pydecimal.py b/Lib/_pydecimal.py
index 21e875c31c4..6318a49ce70 100644
--- a/Lib/_pydecimal.py
+++ b/Lib/_pydecimal.py
@@ -589,7 +589,7 @@ class Decimal(object):
         # From a string
         # REs insist on real strings, so we can too.
         if isinstance(value, str):
-            m = _parser(value.strip())
+            m = _parser(value.strip().replace("_", ""))
             if m is None:
                 if context is None:
                     context = getcontext()
@@ -4125,7 +4125,7 @@ class Context(object):
         This will make it round up for that operation.
         """
         rounding = self.rounding
-        self.rounding= type
+        self.rounding = type
         return rounding
 
     def create_decimal(self, num='0'):
@@ -4134,10 +4134,10 @@ class Context(object):
         This method implements the to-number operation of the
         IBM Decimal specification."""
 
-        if isinstance(num, str) and num != num.strip():
+        if isinstance(num, str) and (num != num.strip() or '_' in num):
             return self._raise_error(ConversionSyntax,
-                                     "no trailing or leading whitespace is "
-                                     "permitted.")
+                                     "trailing or leading whitespace and "
+                                     "underscores are not permitted.")
 
         d = Decimal(num, context=self)
         if d._isnan() and len(d._int) > self.prec - self.clamp:
diff --git a/Lib/test/test_complex.py b/Lib/test/test_complex.py
index 0ef9a7a1098..6633a7ae54b 100644
--- a/Lib/test/test_complex.py
+++ b/Lib/test/test_complex.py
@@ -1,5 +1,7 @@
 import unittest
 from test import support
+from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
+                               INVALID_UNDERSCORE_LITERALS)
 
 from random import random
 from math import atan2, isnan, copysign
@@ -377,6 +379,18 @@ class ComplexTest(unittest.TestCase):
         self.assertAlmostEqual(complex(complex1(1j)), 2j)
         self.assertRaises(TypeError, complex, complex2(1j))
 
+    def test_underscores(self):
+        # check underscores
+        for lit in VALID_UNDERSCORE_LITERALS:
+            if not any(ch in lit for ch in 'xXoObB'):
+                self.assertEqual(complex(lit), eval(lit))
+                self.assertEqual(complex(lit), complex(lit.replace('_', '')))
+        for lit in INVALID_UNDERSCORE_LITERALS:
+            if lit in ('0_7', '09_99'):  # octals are not recognized here
+                continue
+            if not any(ch in lit for ch in 'xXoObB'):
+                self.assertRaises(ValueError, complex, lit)
+
     def test_hash(self):
         for x in range(-30, 30):
             self.assertEqual(hash(x), hash(complex(x, 0)))
diff --git a/Lib/test/test_decimal.py b/Lib/test/test_decimal.py
index 7492f5466f0..617a37eec82 100644
--- a/Lib/test/test_decimal.py
+++ b/Lib/test/test_decimal.py
@@ -554,6 +554,10 @@ class ExplicitConstructionTest(unittest.TestCase):
         self.assertEqual(str(Decimal('  -7.89')), '-7.89')
         self.assertEqual(str(Decimal("  3.45679  ")), '3.45679')
 
+        # underscores
+        self.assertEqual(str(Decimal('1_3.3e4_0')), '1.33E+41')
+        self.assertEqual(str(Decimal('1_0_0_0')), '1000')
+
         # unicode whitespace
         for lead in ["", ' ', '\u00a0', '\u205f']:
             for trail in ["", ' ', '\u00a0', '\u205f']:
@@ -578,6 +582,9 @@ class ExplicitConstructionTest(unittest.TestCase):
             # embedded NUL
             self.assertRaises(InvalidOperation, Decimal, "12\u00003")
 
+            # underscores don't prevent errors
+            self.assertRaises(InvalidOperation, Decimal, "1_2_\u00003")
+
     @cpython_only
     def test_from_legacy_strings(self):
         import _testcapi
@@ -772,6 +779,9 @@ class ExplicitConstructionTest(unittest.TestCase):
         self.assertRaises(InvalidOperation, nc.create_decimal, "xyz")
         self.assertRaises(ValueError, nc.create_decimal, (1, "xyz", -25))
         self.assertRaises(TypeError, nc.create_decimal, "1234", "5678")
+        # no whitespace and underscore stripping is done with this method
+        self.assertRaises(InvalidOperation, nc.create_decimal, " 1234")
+        self.assertRaises(InvalidOperation, nc.create_decimal, "12_34")
 
         # too many NaN payload digits
         nc.prec = 3
diff --git a/Lib/test/test_float.py b/Lib/test/test_float.py
index 68b212e1959..ac8473db503 100644
--- a/Lib/test/test_float.py
+++ b/Lib/test/test_float.py
@@ -1,4 +1,3 @@
-
 import fractions
 import operator
 import os
@@ -9,6 +8,8 @@ import time
 import unittest
 
 from test import support
+from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
+                               INVALID_UNDERSCORE_LITERALS)
 from math import isinf, isnan, copysign, ldexp
 
 INF = float("inf")
@@ -60,6 +61,27 @@ class GeneralFloatCases(unittest.TestCase):
         float(b'.' + b'1'*1000)
         float('.' + '1'*1000)
 
+    def test_underscores(self):
+        for lit in VALID_UNDERSCORE_LITERALS:
+            if not any(ch in lit for ch in 'jJxXoObB'):
+                self.assertEqual(float(lit), eval(lit))
+                self.assertEqual(float(lit), float(lit.replace('_', '')))
+        for lit in INVALID_UNDERSCORE_LITERALS:
+            if lit in ('0_7', '09_99'):  # octals are not recognized here
+                continue
+            if not any(ch in lit for ch in 'jJxXoObB'):
+                self.assertRaises(ValueError, float, lit)
+        # Additional test cases; nan and inf are never valid as literals,
+        # only in the float() constructor, but we don't allow underscores
+        # in or around them.
+        self.assertRaises(ValueError, float, '_NaN')
+        self.assertRaises(ValueError, float, 'Na_N')
+        self.assertRaises(ValueError, float, 'IN_F')
+        self.assertRaises(ValueError, float, '-_INF')
+        self.assertRaises(ValueError, float, '-INF_')
+        # Check that we handle bytes values correctly.
+        self.assertRaises(ValueError, float, b'0_.\xff9')
+
     def test_non_numeric_input_types(self):
         # Test possible non-numeric types for the argument x, including
         # subclasses of the explicitly documented accepted types.
diff --git a/Lib/test/test_grammar.py b/Lib/test/test_grammar.py
index 109013f5e2f..914aa679441 100644
--- a/Lib/test/test_grammar.py
+++ b/Lib/test/test_grammar.py
@@ -16,6 +16,87 @@ from collections import ChainMap
 from test import ann_module2
 import test
 
+# These are shared with test_tokenize and other test modules.
+#
+# Note: since several test cases filter out floats by looking for "e" and ".",
+# don't add hexadecimal literals that contain "e" or "E".
+VALID_UNDERSCORE_LITERALS = [
+    '0_0_0',
+    '4_2',
+    '1_0000_0000',
+    '0b1001_0100',
+    '0xffff_ffff',
+    '0o5_7_7',
+    '1_00_00.5',
+    '1_00_00.5e5',
+    '1_00_00e5_1',
+    '1e1_0',
+    '.1_4',
+    '.1_4e1',
+    '0b_0',
+    '0x_f',
+    '0o_5',
+    '1_00_00j',
+    '1_00_00.5j',
+    '1_00_00e5_1j',
+    '.1_4j',
+    '(1_2.5+3_3j)',
+    '(.5_6j)',
+]
+INVALID_UNDERSCORE_LITERALS = [
+    # Trailing underscores:
+    '0_',
+    '42_',
+    '1.4j_',
+    '0x_',
+    '0b1_',
+    '0xf_',
+    '0o5_',
+    '0 if 1_Else 1',
+    # Underscores in the base selector:
+    '0_b0',
+    '0_xf',
+    '0_o5',
+    # Old-style octal, still disallowed:
+    '0_7',
+    '09_99',
+    # Multiple consecutive underscores:
+    '4_______2',
+    '0.1__4',
+    '0.1__4j',
+    '0b1001__0100',
+    '0xffff__ffff',
+    '0x___',
+    '0o5__77',
+    '1e1__0',
+    '1e1__0j',
+    # Underscore right before a dot:
+    '1_.4',
+    '1_.4j',
+    # Underscore right after a dot:
+    '1._4',
+    '1._4j',
+    '._5',
+    '._5j',
+    # Underscore right after a sign:
+    '1.0e+_1',
+    '1.0e+_1j',
+    # Underscore right before j:
+    '1.4_j',
+    '1.4e5_j',
+    # Underscore right before e:
+    '1_e1',
+    '1.4_e1',
+    '1.4_e1j',
+    # Underscore right after e:
+    '1e_1',
+    '1.4e_1',
+    '1.4e_1j',
+    # Complex cases with parens:
+    '(1+1.5_j_)',
+    '(1+1.5_j)',
+]
+
 
 class TokenTests(unittest.TestCase):
 
@@ -95,6 +176,14 @@ class TokenTests(unittest.TestCase):
         self.assertEqual(1 if 0else 0, 0)
         self.assertRaises(SyntaxError, eval, "0 if 1Else 0")
 
+    def test_underscore_literals(self):
+        for lit in VALID_UNDERSCORE_LITERALS:
+            self.assertEqual(eval(lit), eval(lit.replace('_', '')))
+        for lit in INVALID_UNDERSCORE_LITERALS:
+            self.assertRaises(SyntaxError, eval, lit)
+        # Sanity check: no literal begins with an underscore
+        self.assertRaises(NameError, eval, "_0")
+
     def test_string_literals(self):
         x = ''; y = ""; self.assertTrue(len(x) == 0 and x == y)
         x = '\''; y = "'"; self.assertTrue(len(x) == 1 and x == y and ord(x) == 39)
diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py
index 8847f4ce972..14bbd6192a0 100644
--- a/Lib/test/test_int.py
+++ b/Lib/test/test_int.py
@@ -2,6 +2,8 @@ import sys
 
 import unittest
 from test import support
+from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
+                               INVALID_UNDERSCORE_LITERALS)
 
 L = [
         ('0', 0),
@@ -212,6 +214,25 @@ class IntTestCases(unittest.TestCase):
         self.assertEqual(int('2br45qc', 35), 4294967297)
         self.assertEqual(int('1z141z5', 36), 4294967297)
 
+    def test_underscores(self):
+        for lit in VALID_UNDERSCORE_LITERALS:
+            if any(ch in lit for ch in '.eEjJ'):
+                continue
+            self.assertEqual(int(lit, 0), eval(lit))
+            self.assertEqual(int(lit, 0), int(lit.replace('_', ''), 0))
+        for lit in INVALID_UNDERSCORE_LITERALS:
+            if any(ch in lit for ch in '.eEjJ'):
+                continue
+            self.assertRaises(ValueError, int, lit, 0)
+        # Additional test cases with bases != 0, only for the constructor:
+        self.assertEqual(int("1_00", 3), 9)
+        self.assertEqual(int("0_100"), 100)  # not valid as a literal!
+        self.assertEqual(int(b"1_00"), 100)  # byte underscore
+        self.assertRaises(ValueError, int, "_100")
+        self.assertRaises(ValueError, int, "+_100")
+        self.assertRaises(ValueError, int, "1__00")
+        self.assertRaises(ValueError, int, "100_")
+
     @support.cpython_only
     def test_small_ints(self):
         # Bug #3236: Return small longs from PyLong_FromString
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 4c469a890f8..5a81a5f11a4 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -3,7 +3,9 @@ from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer)
 from io import BytesIO
-from unittest import TestCase, mock, main
+from unittest import TestCase, mock
+from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
+                               INVALID_UNDERSCORE_LITERALS)
 import os
 import token
 
@@ -185,6 +187,21 @@ def k(x):
     NUMBER     '3.14e159'    (1, 4) (1, 12)
     """)
 
+    def test_underscore_literals(self):
+        def number_token(s):
+            f = BytesIO(s.encode('utf-8'))
+            for toktype, token, start, end, line in tokenize(f.readline):
+                if toktype == NUMBER:
+                    return token
+            return 'invalid token'
+        for lit in VALID_UNDERSCORE_LITERALS:
+            if '(' in lit:
+                # this won't work with compound complex inputs
+                continue
+            self.assertEqual(number_token(lit), lit)
+        for lit in INVALID_UNDERSCORE_LITERALS:
+            self.assertNotEqual(number_token(lit), lit)
+
     def test_string(self):
         # String literals
         self.check_tokenize("x = ''; y = \"\"", """\
@@ -1529,11 +1546,10 @@ class TestRoundtrip(TestCase):
         tempdir = os.path.dirname(fn) or os.curdir
         testfiles = glob.glob(os.path.join(tempdir, "test*.py"))
 
-        # Tokenize is broken on test_unicode_identifiers.py because regular
-        # expressions are broken on the obscure unicode identifiers in it.
-        # *sigh* With roundtrip extended to test the 5-tuple mode of
-        # untokenize, 7 more testfiles fail.  Remove them also until the
-        # failure is diagnosed.
+        # Tokenize is broken on test_pep3131.py because regular expressions are
+        # broken on the obscure unicode identifiers in it. *sigh*
+        # With roundtrip extended to test the 5-tuple mode of untokenize,
+        # 7 more testfiles fail.  Remove them also until the failure is diagnosed.
 
         testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py"))
         for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
@@ -1565,4 +1581,4 @@ class TestRoundtrip(TestCase):
 
 
 if __name__ == "__main__":
-    main()
+    unittest.main()
diff --git a/Lib/test/test_types.py b/Lib/test/test_types.py
index a202196bd2f..382ca03e5ad 100644
--- a/Lib/test/test_types.py
+++ b/Lib/test/test_types.py
@@ -48,6 +48,7 @@ class TypesTests(unittest.TestCase):
     def test_float_constructor(self):
         self.assertRaises(ValueError, float, '')
         self.assertRaises(ValueError, float, '5\0')
+        self.assertRaises(ValueError, float, '5_5\0')
 
     def test_zero_division(self):
         try: 5.0 / 0.0
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index ec79ec886da..825aa906460 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -120,16 +120,17 @@ Comment = r'#[^\r\n]*'
 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
 Name = r'\w+'
 
-Hexnumber = r'0[xX][0-9a-fA-F]+'
-Binnumber = r'0[bB][01]+'
-Octnumber = r'0[oO][0-7]+'
-Decnumber = r'(?:0+|[1-9][0-9]*)'
+Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
+Binnumber = r'0[bB](?:_?[01])+'
+Octnumber = r'0[oO](?:_?[0-7])+'
+Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?[0-9]+'
-Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
-Expfloat = r'[0-9]+' + Exponent
+Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
+Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
+                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
+Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
 Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
+Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
 Number = group(Imagnumber, Floatnumber, Intnumber)
 
 # Return the empty string, plus all of the valid string prefixes.
author	Brett Cannon <brett@python.org>	2016-09-09 14:57:09 -0700
committer	Brett Cannon <brett@python.org>	2016-09-09 14:57:09 -0700
commit	a721abac299bb6529021000a71847486d531b41a (patch)
tree	8355a69b891cfcdaad8a5fd62870231b7f940696 /Lib
parent	Merge heads (diff)
download	cpython-a721abac299bb6529021000a71847486d531b41a.tar.gz cpython-a721abac299bb6529021000a71847486d531b41a.tar.bz2 cpython-a721abac299bb6529021000a71847486d531b41a.zip