diff --git a/lib/tokenize_rt.py b/lib/tokenize_rt.py
new file mode 100644
index 00000000..debbd45f
--- /dev/null
+++ b/lib/tokenize_rt.py
@@ -0,0 +1,217 @@
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import collections
+import io
+import keyword
+import re
+import tokenize
+from typing import Generator
+from typing import Iterable
+from typing import List
+from typing import Optional
+from typing import Pattern
+from typing import Sequence
+from typing import Tuple
+
+
+ESCAPED_NL = 'ESCAPED_NL'
+UNIMPORTANT_WS = 'UNIMPORTANT_WS'
+NON_CODING_TOKENS = frozenset(('COMMENT', ESCAPED_NL, 'NL', UNIMPORTANT_WS))
+
+
+class Offset(collections.namedtuple('Offset', ('line', 'utf8_byte_offset'))):
+    __slots__ = ()
+
+    def __new__(cls, line=None, utf8_byte_offset=None):
+        # type: (Optional[int], Optional[int]) -> None
+        return super(Offset, cls).__new__(cls, line, utf8_byte_offset)
+
+
+class Token(
+        collections.namedtuple(
+            'Token', ('name', 'src', 'line', 'utf8_byte_offset'),
+        ),
+):
+    __slots__ = ()
+
+    def __new__(cls, name, src, line=None, utf8_byte_offset=None):
+        # type: (str, str, Optional[int], Optional[int]) -> None
+        return super(Token, cls).__new__(
+            cls, name, src, line, utf8_byte_offset,
+        )
+
+    @property
+    def offset(self):  # type: () -> Offset
+        return Offset(self.line, self.utf8_byte_offset)
+
+
+_string_re = re.compile('^([^\'"]*)(.*)$', re.DOTALL)
+_string_prefixes = frozenset('bfru')
+_escaped_nl_re = re.compile(r'\\(\n|\r\n|\r)')
+
+
+def _re_partition(regex, s):
+    # type: (Pattern[str], str) -> Tuple[str, str, str]
+    match = regex.search(s)
+    if match:
+        return s[:match.start()], s[slice(*match.span())], s[match.end():]
+    else:
+        return (s, '', '')
+
+
+def src_to_tokens(src):  # type: (str) -> List[Token]
+    tokenize_target = io.StringIO(src)
+    lines = ('',) + tuple(tokenize_target)
+
+    tokenize_target.seek(0)
+
+    tokens = []
+    last_line = 1
+    last_col = 0
+
+    for (
+            tok_type, tok_text, (sline, scol), (eline, ecol), line,
+    ) in tokenize.generate_tokens(tokenize_target.readline):
+        if sline > last_line:
+            newtok = lines[last_line][last_col:]
+            for lineno in range(last_line + 1, sline):
+                newtok += lines[lineno]
+            if scol > 0:
+                newtok += lines[sline][:scol]
+
+            # a multiline unimportant whitespace may contain escaped newlines
+            while _escaped_nl_re.search(newtok):
+                ws, nl, newtok = _re_partition(_escaped_nl_re, newtok)
+                if ws:
+                    tokens.append(Token(UNIMPORTANT_WS, ws))
+                tokens.append(Token(ESCAPED_NL, nl))
+            if newtok:
+                tokens.append(Token(UNIMPORTANT_WS, newtok))
+
+        elif scol > last_col:
+            tokens.append(Token(UNIMPORTANT_WS, line[last_col:scol]))
+
+        tok_name = tokenize.tok_name[tok_type]
+        utf8_byte_offset = len(line[:scol].encode('UTF-8'))
+        # when a string prefix is not recognized, the tokenizer produces a
+        # NAME token followed by a STRING token
+        if (
+                tok_name == 'STRING' and
+                tokens and
+                tokens[-1].name == 'NAME' and
+                frozenset(tokens[-1].src.lower()) <= _string_prefixes
+        ):
+            newsrc = tokens[-1].src + tok_text
+            tokens[-1] = tokens[-1]._replace(src=newsrc, name=tok_name)
+        # produce octal literals as a single token in python 3 as well
+        elif (
+                tok_name == 'NUMBER' and
+                tokens and
+                tokens[-1].name == 'NUMBER'
+        ):  # pragma: no cover (PY3)
+            tokens[-1] = tokens[-1]._replace(src=tokens[-1].src + tok_text)
+        # produce long literals as a single token in python 3 as well
+        elif (
+                tok_name == 'NAME' and
+                tok_text.lower() == 'l' and
+                tokens and
+                tokens[-1].name == 'NUMBER'
+        ):  # pragma: no cover (PY3)
+            tokens[-1] = tokens[-1]._replace(src=tokens[-1].src + tok_text)
+        else:
+            tokens.append(Token(tok_name, tok_text, sline, utf8_byte_offset))
+        last_line, last_col = eline, ecol
+
+    return tokens
+
+
+def tokens_to_src(tokens):  # type: (Iterable[Token]) -> str
+    return ''.join(tok.src for tok in tokens)
+
+
+def reversed_enumerate(tokens):
+    # type: (Sequence[Token]) -> Generator[Tuple[int, Token], None, None]
+    for i in reversed(range(len(tokens))):
+        yield i, tokens[i]
+
+
+def parse_string_literal(src):  # type: (str) -> Tuple[str, str]
+    """parse a string literal's source into (prefix, string)"""
+    match = _string_re.match(src)
+    assert match is not None
+    return match.group(1), match.group(2)
+
+
+def rfind_string_parts(tokens, i):
+    # type: (Sequence[Token], int) -> Tuple[int, ...]
+    """find the indices of the string parts of a (joined) string literal
+
+    - `i` should start at the end of the string literal
+    - returns `()` (an empty tuple) for things which are not string literals
+    """
+    ret = []
+    depth = 0
+    for i in range(i, -1, -1):
+        token = tokens[i]
+        if token.name == 'STRING':
+            ret.append(i)
+        elif token.name in NON_CODING_TOKENS:
+            pass
+        elif token.src == ')':
+            depth += 1
+        elif depth and token.src == '(':
+            depth -= 1
+            # if we closed the paren(s) make sure it was a parenthesized string
+            # and not actually a call
+            if depth == 0:
+                for j in range(i - 1, -1, -1):
+                    tok = tokens[j]
+                    if tok.name in NON_CODING_TOKENS:
+                        pass
+                    # this was actually a call and not a parenthesized string
+                    elif (
+                            tok.src in {']', ')'} or (
+                                tok.name == 'NAME' and
+                                tok.src not in keyword.kwlist
+                            )
+                    ):
+                        return ()
+                    else:
+                        break
+                break
+        elif depth:  # it looked like a string but wasn't
+            return ()
+        else:
+            break
+    return tuple(reversed(ret))
+
+
+def main(argv=None):  # type: (Optional[Sequence[str]]) -> int
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filename')
+    args = parser.parse_args(argv)
+    with io.open(args.filename) as f:
+        tokens = src_to_tokens(f.read())
+
+    def no_u_repr(s):  # type: (str) -> str
+        return repr(s).lstrip('u')
+
+    for token in tokens:
+        if token.name == UNIMPORTANT_WS:
+            line, col = '?', '?'
+        else:
+            line, col = token.line, token.utf8_byte_offset
+        print(
+            '{}:{} {} {}'.format(
+                line, col, token.name, no_u_repr(token.src),
+            ),
+        )
+
+    return 0
+
+
+if __name__ == '__main__':
+    exit(main())
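
For context, a minimal usage sketch of the API this file adds (not part of the diff). It assumes lib/ is on sys.path so the module imports as tokenize_rt; the sample source string and the closing-paren lookup are illustrative only, not taken from this change.

# Usage sketch; adjust the import to however this repo exposes lib/.
from tokenize_rt import reversed_enumerate
from tokenize_rt import rfind_string_parts
from tokenize_rt import src_to_tokens
from tokenize_rt import tokens_to_src

src = "x = ('foo'  'bar')\n"  # illustrative input only

# src_to_tokens keeps comments, escaped newlines, and plain whitespace as
# tokens, so the stream round-trips back to the exact original source.
tokens = src_to_tokens(src)
assert tokens_to_src(tokens) == src

# rfind_string_parts takes the index of the end of a (possibly parenthesized,
# implicitly joined) string literal, walks backwards, and returns the indices
# of its STRING tokens.
close_paren = next(i for i, tok in reversed_enumerate(tokens) if tok.src == ')')
parts = rfind_string_parts(tokens, close_paren)
print([tokens[i].src for i in parts])  # ["'foo'", "'bar'"]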