import re
import json
import sys
from dataclasses import dataclass
def json_simple():
    """Return a small JSON document exercising several value types."""
    payload = {
        'msg': 'hello world',
        'n': 99.34,
        'status': True,
        'other': None,
    }
    return json.dumps(payload, indent = 4)
def good_tokens_bad_json():
    """Text whose tokens all lex individually but do not form valid JSON.

    The lexer should accept this; rejecting it is the parser's job.
    """
    return ' true null "hi" } 123 { '
def json_quoting_example():
    """JSON-encode example strings containing quotes and backslashes."""
    samples = [
        # Just quotes.
        r'__"foo"__',
        r'__"foo__',
        r'__foo"__',
        # Quotes with leading backslashes.
        r'__\"foo\"__',
        r'__\"foo__',
        r'__foo\"__',
        # Quotes with 2 leading backslashes.
        r'__\\"foo\\"__',
        r'__\\"foo__',
        r'__foo\\"__',
    ]
    # Key each sample as ex0, ex1, ... and serialize the whole mapping.
    keyed = {}
    for i, sample in enumerate(samples):
        keyed[f'ex{i}'] = sample
    return json.dumps(keyed, indent = 4)
def invalid_text():
    """Text with an unquoted key, which even the lexer cannot tokenize."""
    return '{"a": 123, blort: 99}'
# Map each example name (usable as a CLI argument) to its generated text.
EXAMPLES = {
    'quoting': json_quoting_example(),
    'goodbad': good_tokens_bad_json(),
    'simple': json_simple(),
    'invalid': invalid_text(),
}
def main():
    """Lex the example named by the first CLI argument (default 'simple')."""
    key = sys.argv[1] if len(sys.argv) > 1 else 'simple'
    lexer = Lexer(EXAMPLES[key])
    lexer.lex()
    for token in lexer.tokens:
        print(token)
# Sentinel token type marking the end of input.
EOF = 'EOF'

TOKEN_DEFINITIONS = [
    # Try to match these bigger concepts first.
    #
    # A JSON string: between the quotes, any mix of non-quote/non-backslash
    # characters and backslash-escaped pairs.  Treating '\\.' as a unit fixes
    # the old lookbehind pattern (r'".*?(?<!\\)"'), which wrongly rejected
    # strings whose content ends in an escaped backslash, e.g. "foo\\".
    ('QUOTED_STRING', r'"(?:[^"\\]|\\.)*"'),
    # A JSON number: optional minus, integer part, optional fraction,
    # optional exponent.  Per the JSON grammar the exponent marker may be
    # 'e' or 'E' and its sign is optional (the old pattern required a
    # lowercase 'e' followed by a mandatory sign).
    ('NUMBER', r'-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][-+]?\d+)?'),
    # Everything else is atomic and simple, so no confusion to worry about.
    ('OPEN_BRACE', r'{'),
    ('CLOSE_BRACE', r'}'),
    ('OPEN_BRACKET', r'\['),
    ('CLOSE_BRACKET', r'\]'),
    ('COMMA', r','),
    ('WHITESPACE', r'\s+'),
    ('COLON', r':'),
    ('NULL', r'null'),
    ('TRUE', r'true'),
    ('FALSE', r'false'),
    (EOF, r'$'),
]

# Pre-compile every pattern once at import time.  (Renamed the loop
# variable so it no longer shadows the builtin `type`.)
TOKEN_DEFINITIONS = [
    (tok_type, re.compile(pattern))
    for tok_type, pattern in TOKEN_DEFINITIONS
]
# A single lexical token.  Frozen so instances are immutable and hashable.
@dataclass(frozen = True)
class Token:
    # Offset into the source text recorded by the lexer.
    pos: int
    # One of the TOKEN_DEFINITIONS type names (or EOF).
    type: str
    # The exact text this token matched.
    value: str
class Lexer:
    """Converts the input text into a flat list of Tokens.

    Whitespace is emitted as WHITESPACE tokens rather than skipped; the
    token stream always ends with a single EOF token.
    """

    def __init__(self, text):
        self.text = text      # full input being tokenized
        self.pos = 0          # current scan offset into text
        self.tokens = []      # tokens accumulated by lex()

    def lex(self):
        """Tokenize the entire input, appending Tokens to self.tokens.

        The EOF definition matches the empty string at end of input, so the
        loop always terminates (or get_next_token raises first).
        """
        while True:
            tok = self.get_next_token()
            self.tokens.append(tok)
            if tok.type == EOF:
                break

    def get_next_token(self):
        """Return the next Token at self.pos and advance past it.

        Raises ValueError (with a short context snippet) when no token
        definition matches at the current position.
        """
        for tok_type, rgx in TOKEN_DEFINITIONS:
            m = rgx.match(self.text, pos = self.pos)
            if m:
                value = m.group()
                # Capture the start offset before advancing: the original
                # code incremented self.pos first, so every Token recorded
                # its end position instead of where it begins.
                start = self.pos
                self.pos += len(value)
                return Token(start, tok_type, value)
        chunk = self.text[self.pos : self.pos + 20]
        msg = f'Unrecognized token: position={self.pos} content={chunk!r}'
        raise ValueError(msg)
# Script entry point: only run the demo when executed directly.
if __name__ == '__main__':
    main()
Postscript: my regular expression for quoted strings is also bad. It fails
when the string actually ends with a backslash. I believe that this
answer shows how to do it
correctly (but I did not test it extensively, which one really must do for
these sorts of things). Here are the components of the regular expression
broken down and applied to your situation (double-quoted strings only):
" # Opening quote.
(
(\\{2})* # Just an even N of backslashes.
| # OR ...
(
.*? # Stuff, non-greedy.
[^\\] # Non backslash.
(\\{2})* # Even N of backslashes.
)
)
" # Closing quote.