mirror of https://github.com/Tautulli/Tautulli.git

Add tokenize_rt-3.2.0

parent d460263b97, commit 2dcc74d82d

1 changed file with 217 additions and 0 deletions:

lib/tokenize_rt.py (new file, 217 lines)
@@ -0,0 +1,217 @@
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import collections
import io
import keyword
import re
import tokenize
from typing import Generator
from typing import Iterable
from typing import List
from typing import Optional
from typing import Pattern
from typing import Sequence
from typing import Tuple


ESCAPED_NL = 'ESCAPED_NL'
UNIMPORTANT_WS = 'UNIMPORTANT_WS'
NON_CODING_TOKENS = frozenset(('COMMENT', ESCAPED_NL, 'NL', UNIMPORTANT_WS))
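# tokens of these types never change a program's meaning, so token-level
# rewriters can safely skip over them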


class Offset(collections.namedtuple('Offset', ('line', 'utf8_byte_offset'))):
    __slots__ = ()

    def __new__(cls, line=None, utf8_byte_offset=None):
        # type: (Optional[int], Optional[int]) -> None
        return super(Offset, cls).__new__(cls, line, utf8_byte_offset)


class Token(
        collections.namedtuple(
            'Token', ('name', 'src', 'line', 'utf8_byte_offset'),
        ),
):
    __slots__ = ()

    def __new__(cls, name, src, line=None, utf8_byte_offset=None):
        # type: (str, str, Optional[int], Optional[int]) -> None
        return super(Token, cls).__new__(
            cls, name, src, line, utf8_byte_offset,
        )

    @property
    def offset(self):  # type: () -> Offset
        return Offset(self.line, self.utf8_byte_offset)


_string_re = re.compile('^([^\'"]*)(.*)$', re.DOTALL)
_string_prefixes = frozenset('bfru')
_escaped_nl_re = re.compile(r'\\(\n|\r\n|\r)')


def _re_partition(regex, s):
    # type: (Pattern[str], str) -> Tuple[str, str, str]
    match = regex.search(s)
    if match:
        return s[:match.start()], s[slice(*match.span())], s[match.end():]
    else:
        return (s, '', '')
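# for illustration: _re_partition behaves like str.partition but splits on
# the regex's first match, e.g.
#     _re_partition(re.compile('-'), 'a-b-c') == ('a', '-', 'b-c')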


def src_to_tokens(src):  # type: (str) -> List[Token]
    tokenize_target = io.StringIO(src)
    lines = ('',) + tuple(tokenize_target)

    tokenize_target.seek(0)

    tokens = []
    last_line = 1
    last_col = 0

    for (
            tok_type, tok_text, (sline, scol), (eline, ecol), line,
    ) in tokenize.generate_tokens(tokenize_target.readline):
        if sline > last_line:
            newtok = lines[last_line][last_col:]
            for lineno in range(last_line + 1, sline):
                newtok += lines[lineno]
            if scol > 0:
                newtok += lines[sline][:scol]

            # a multiline unimportant whitespace may contain escaped newlines
            while _escaped_nl_re.search(newtok):
                ws, nl, newtok = _re_partition(_escaped_nl_re, newtok)
                if ws:
                    tokens.append(Token(UNIMPORTANT_WS, ws))
                tokens.append(Token(ESCAPED_NL, nl))
            if newtok:
                tokens.append(Token(UNIMPORTANT_WS, newtok))

        elif scol > last_col:
            tokens.append(Token(UNIMPORTANT_WS, line[last_col:scol]))

        tok_name = tokenize.tok_name[tok_type]
        utf8_byte_offset = len(line[:scol].encode('UTF-8'))
        # when a string prefix is not recognized, the tokenizer produces a
        # NAME token followed by a STRING token
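        # e.g. under a python 2 tokenizer, f'foo' arrives as NAME('f')
        # followed by STRING("'foo'") and is re-joined here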
        if (
                tok_name == 'STRING' and
                tokens and
                tokens[-1].name == 'NAME' and
                frozenset(tokens[-1].src.lower()) <= _string_prefixes
        ):
            newsrc = tokens[-1].src + tok_text
            tokens[-1] = tokens[-1]._replace(src=newsrc, name=tok_name)
        # produce octal literals as a single token in python 3 as well
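        # e.g. '0755' arrives as NUMBER('0') + NUMBER('755') under python 3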
        elif (
                tok_name == 'NUMBER' and
                tokens and
                tokens[-1].name == 'NUMBER'
        ):  # pragma: no cover (PY3)
            tokens[-1] = tokens[-1]._replace(src=tokens[-1].src + tok_text)
        # produce long literals as a single token in python 3 as well
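        # e.g. '123L' arrives as NUMBER('123') + NAME('L') under python 3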
        elif (
                tok_name == 'NAME' and
                tok_text.lower() == 'l' and
                tokens and
                tokens[-1].name == 'NUMBER'
        ):  # pragma: no cover (PY3)
            tokens[-1] = tokens[-1]._replace(src=tokens[-1].src + tok_text)
        else:
            tokens.append(Token(tok_name, tok_text, sline, utf8_byte_offset))
        last_line, last_col = eline, ecol

    return tokens


def tokens_to_src(tokens):  # type: (Iterable[Token]) -> str
    return ''.join(tok.src for tok in tokens)


def reversed_enumerate(tokens):
    # type: (Sequence[Token]) -> Generator[Tuple[int, Token], None, None]
    for i in reversed(range(len(tokens))):
        yield i, tokens[i]


def parse_string_literal(src):  # type: (str) -> Tuple[str, str]
    """parse a string literal's source into (prefix, string)"""
    match = _string_re.match(src)
    assert match is not None
    return match.group(1), match.group(2)
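# for illustration: parse_string_literal("f'foo'") == ('f', "'foo'")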


def rfind_string_parts(tokens, i):
    # type: (Sequence[Token], int) -> Tuple[int, ...]
    """find the indices of the string parts of a (joined) string literal

    - `i` should start at the end of the string literal
    - returns `()` (an empty tuple) for things which are not string literals
    """
    ret = []
    depth = 0
    for i in range(i, -1, -1):
        token = tokens[i]
        if token.name == 'STRING':
            ret.append(i)
        elif token.name in NON_CODING_TOKENS:
            pass
        elif token.src == ')':
            depth += 1
        elif depth and token.src == '(':
            depth -= 1
            # if we closed the paren(s) make sure it was a parenthesized string
            # and not actually a call
            if depth == 0:
                for j in range(i - 1, -1, -1):
                    tok = tokens[j]
                    if tok.name in NON_CODING_TOKENS:
                        pass
                    # this was actually a call and not a parenthesized string
                    elif (
                            tok.src in {']', ')'} or (
                                tok.name == 'NAME' and
                                tok.src not in keyword.kwlist
                            )
                    ):
                        return ()
                    else:
                        break
                break
        elif depth:  # it looked like a string but wasn't
            return ()
        else:
            break
    return tuple(reversed(ret))
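# for illustration: given tokens = src_to_tokens("'a' 'b'\n"), the implicitly
# joined literal has STRING parts at indices 0 and 2, so
#     rfind_string_parts(tokens, 2) == (0, 2)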


def main(argv=None):  # type: (Optional[Sequence[str]]) -> int
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    args = parser.parse_args(argv)
    with io.open(args.filename) as f:
        tokens = src_to_tokens(f.read())

    def no_u_repr(s):  # type: (str) -> str
        return repr(s).lstrip('u')

    for token in tokens:
        if token.name == UNIMPORTANT_WS:
            line, col = '?', '?'
        else:
            line, col = token.line, token.utf8_byte_offset
        print(
            '{}:{} {} {}'.format(
                line, col, token.name, no_u_repr(token.src),
            ),
        )

    return 0


if __name__ == '__main__':
    exit(main())
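As a quick orientation (a sketch, not part of the vendored file): the module's
core guarantee is that src_to_tokens / tokens_to_src round-trip source exactly,
which is what makes in-place token rewriting safe. The import path below
assumes the lib/ layout this commit uses.

    # demo of assumed usage; 'lib.tokenize_rt' reflects where this commit
    # places the file
    from lib.tokenize_rt import UNIMPORTANT_WS, src_to_tokens, tokens_to_src

    src = 'x = 1  # comment\n'
    tokens = src_to_tokens(src)
    assert tokens_to_src(tokens) == src  # lossless round trip

    # rewrite: collapse each run of unimportant whitespace to a single space
    rewritten = [
        tok._replace(src=' ') if tok.name == UNIMPORTANT_WS else tok
        for tok in tokens
    ]
    print(tokens_to_src(rewritten))  # -> x = 1 # comment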