aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2018-12-22 11:18:40 +0200
committerGitHub <noreply@github.com>2018-12-22 11:18:40 +0200
commit8ac658114dec4964479baecfbc439fceb40eaa79 (patch)
treee66c4c3beda293a6fdf01763306697d15d0af157 /Lib/tokenize.py
parent bpo-22703: IDLE: Improve Code Context and Zoom Height menu labels (GH-11214) (diff)
downloadcpython-8ac658114dec4964479baecfbc439fceb40eaa79.tar.gz
cpython-8ac658114dec4964479baecfbc439fceb40eaa79.tar.bz2
cpython-8ac658114dec4964479baecfbc439fceb40eaa79.zip
bpo-30455: Generate all token related code and docs from Grammar/Tokens. (GH-10370)
"Include/token.h", "Lib/token.py" (now containing some data moved from "Lib/tokenize.py") and new files "Parser/token.c" (containing the code moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by "Tools/scripts/generate_token.py". The script overwrites files only if needed and can be used on a read-only source tree. "Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py" instead of being executable itself. Added new make targets "regen-token" and "regen-symbol", which are now dependencies of "regen-all". The documentation now contains strings for operators and punctuation tokens.
Diffstat (limited to 'Lib/tokenize.py')
-rw-r--r--Lib/tokenize.py66
1 file changed, 6 insertions, 60 deletions
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index fce010bc5e7..cf1ecc99a94 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -32,6 +32,7 @@ import itertools as _itertools
import re
import sys
from token import *
+from token import EXACT_TOKEN_TYPES
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@@ -41,55 +42,6 @@ __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
"untokenize", "TokenInfo"]
del token
-EXACT_TOKEN_TYPES = {
- '(': LPAR,
- ')': RPAR,
- '[': LSQB,
- ']': RSQB,
- ':': COLON,
- ',': COMMA,
- ';': SEMI,
- '+': PLUS,
- '-': MINUS,
- '*': STAR,
- '/': SLASH,
- '|': VBAR,
- '&': AMPER,
- '<': LESS,
- '>': GREATER,
- '=': EQUAL,
- '.': DOT,
- '%': PERCENT,
- '{': LBRACE,
- '}': RBRACE,
- '==': EQEQUAL,
- '!=': NOTEQUAL,
- '<=': LESSEQUAL,
- '>=': GREATEREQUAL,
- '~': TILDE,
- '^': CIRCUMFLEX,
- '<<': LEFTSHIFT,
- '>>': RIGHTSHIFT,
- '**': DOUBLESTAR,
- '+=': PLUSEQUAL,
- '-=': MINEQUAL,
- '*=': STAREQUAL,
- '/=': SLASHEQUAL,
- '%=': PERCENTEQUAL,
- '&=': AMPEREQUAL,
- '|=': VBAREQUAL,
- '^=': CIRCUMFLEXEQUAL,
- '<<=': LEFTSHIFTEQUAL,
- '>>=': RIGHTSHIFTEQUAL,
- '**=': DOUBLESTAREQUAL,
- '//': DOUBLESLASH,
- '//=': DOUBLESLASHEQUAL,
- '...': ELLIPSIS,
- '->': RARROW,
- '@': AT,
- '@=': ATEQUAL,
-}
-
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
def __repr__(self):
annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
@@ -163,17 +115,11 @@ Triple = group(StringPrefix + "'''", StringPrefix + '"""')
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
-# Because of leftmost-then-longest match semantics, be sure to put the
-# longest operators first (e.g., if = came before ==, == would get
-# recognized as two instances of =).
-Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
- r"//=?", r"->",
- r"[+\-*/%&@|^=<>]=?",
- r"~")
-
-Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
-Funny = group(Operator, Bracket, Special)
+# Sorting in reverse order puts the long operators before their prefixes.
+# Otherwise if = came before ==, == would get recognized as two instances
+# of =.
+Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
+Funny = group(r'\r?\n', Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken