Token-definition order matters. When lexing you have to take special care to attempt to match the token definitions in an order that will avoid confusion, which can occur if one definition embraces a simpler definition (for example, a quoted string can contain lots of other stuff). One strategy to avoid such problems is to attempt the "bigger" entities first. Even though I found no specific problems along these lines in your lexer, on general principle I rearranged the ordering of the token definitions.

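To make the ordering point concrete, here is a tiny hypothetical illustration (INT and FLOAT are made-up token types for this demo, not part of the JSON lexer below). With first-match-wins dispatch, putting the narrower pattern first silently truncates the token:

import re

# Two overlapping token definitions, tried in different orders.
SMALL_FIRST = [('INT', re.compile(r'\d+')), ('FLOAT', re.compile(r'\d+\.\d+'))]
BIG_FIRST   = [('FLOAT', re.compile(r'\d+\.\d+')), ('INT', re.compile(r'\d+'))]

def first_match(defs, text):
    # Return the first (type, value) pair whose pattern matches at the start of text.
    for tok_type, rgx in defs:
        m = rgx.match(text)
        if m:
            return (tok_type, m.group())

print(first_match(SMALL_FIRST, '3.14'))   # ('INT', '3')      -- truncated
print(first_match(BIG_FIRST, '3.14'))     # ('FLOAT', '3.14') -- what we want

Here is the full reworked example:
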
import re
import json
import sys
from dataclasses import dataclass

def json_simple():
    # A simple chunk of JSON with various data types.
    d = dict(
        msg = "hello world",
        n = 99.34,
        status = True,
        other = None,
    )
    return json.dumps(d, indent = 4)

def good_tokens_bad_json():
    # The lexer should accept this. Let the parser reject it.
    return ''' true null "hi" } 123 {  '''

def json_quoting_example():
    # Some strings with internal double-quotes and backslashes.
    examples = (
        # Just quotes.
        r'__"foo"__',
        r'__"foo__',
        r'__foo"__',
        # Quotes with leading backslashes.
        r'__\"foo\"__',
        r'__\"foo__',
        r'__foo\"__',
        # Quotes with 2 leading backslashes.
        r'__\\"foo\\"__',
        r'__\\"foo__',
        r'__foo\\"__',
    )

    # Convert those examples into a dict and then JSON text.
    d = {
        f'ex{i}' : ex
        for i, ex in enumerate(examples)
    }
    return json.dumps(d, indent = 4)

def invalid_text():
    return '''{"a": 123, blort: 99}'''

EXAMPLES = dict(
    quoting = json_quoting_example(),
    goodbad = good_tokens_bad_json(),
    simple = json_simple(),
    invalid = invalid_text(),
)

def main():
    args = sys.argv[1:] + ['simple']
    k = args[0]
    text = EXAMPLES[k]
    lex = Lexer(text)
    lex.lex()
    for tok in lex.tokens:
        print(tok)

EOF = 'EOF'

TOKEN_DEFINITIONS = [
    # Try to match these bigger concepts first.
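    # NOTE: the QUOTED_STRING pattern below is known to be flawed: it breaks
    # when the string itself ends with a backslash (see the postscript below).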
    ('QUOTED_STRING', r'".*?(?<!\\)"'),
    ('NUMBER', r'-?(?:0|[1-9]\d*)(?:\.\d+)?(?:e[-+]\d+)?'),
    # Everything else is atomic and simple, so no confusion to worry about.
    ('OPEN_BRACE', r'{'),
    ('CLOSE_BRACE', r'}'),
    ('OPEN_BRACKET', r'\['),
    ('CLOSE_BRACKET', r'\]'),
    ('COMMA', r','),
    ('WHITESPACE', r'\s+'),
    ('COLON', r':'),
    ('NULL', r'null'),
    ('TRUE', r'true'),
    ('FALSE', r'false'),
    (EOF, r'$')
]

TOKEN_DEFINITIONS = [
    (type, re.compile(pattern))
    for type, pattern in TOKEN_DEFINITIONS
]

@dataclass(frozen = True)
class Token:
    pos: int
    type: str
    value: str

class Lexer:

    def __init__(self, text):
        self.text = text
        self.pos = 0
        self.tokens = []

    def lex(self):
        while True:
            tok = self.get_next_token()
            self.tokens.append(tok)
            if tok.type == EOF:
                break

    def get_next_token(self):
        for tok_type, rgx in TOKEN_DEFINITIONS:
            m = rgx.match(self.text, pos = self.pos)
            if m:
                value = m.group()
                self.pos += len(value)
                return Token(self.pos, tok_type, value)

        chunk = self.text[self.pos : self.pos + 20]
        msg = f'Unrecognized token: position={self.pos} content={chunk!r}'
        raise ValueError(msg)

if __name__ == '__main__':
    main()
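
If you want to poke at the Lexer directly, without the command-line wrapper, something like this works (the input here is just a tiny example of my own):

lex = Lexer('{"a": 1}')
lex.lex()
print([(t.type, t.value) for t in lex.tokens if t.type != 'WHITESPACE'])
# [('OPEN_BRACE', '{'), ('QUOTED_STRING', '"a"'), ('COLON', ':'),
#  ('NUMBER', '1'), ('CLOSE_BRACE', '}'), ('EOF', '')]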

 

Postscript: my regular expression for quoted strings is also bad. It fails when the string actually ends with a backslash. I believe that this answer shows how to do it correctly (but I did not test it extensively, which one really must do with these sorts of things). Here are the components of the regular expression broken down and applied to your situation (double-quoted strings only), followed by a quick sanity check:

"                      # Opening quote.
(
    (\\{2})*           # Just an even N of backslashes.
    |                  # OR ...
    (
        .*?            # Stuff, non-greedy.
        [^\\]          # Non backslash.
        (\\{2})*       # Even N of backslashes.
    )
)
"                      # Closing quote.
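
As a quick, lightly tested sanity check (my own snippet; the OLD_STRING/NEW_STRING names are just for the demo), here is the current pattern next to one assembled from that breakdown, applied to JSON text whose string value ends with a backslash:

import re
import json

# The pattern currently used by the lexer.
OLD_STRING = re.compile(r'".*?(?<!\\)"')

# The breakdown above, kept verbatim thanks to re.VERBOSE.
NEW_STRING = re.compile(r'''
    "                      # Opening quote.
    (
        (\\{2})*           # Just an even N of backslashes.
        |                  # OR ...
        (
            .*?            # Stuff, non-greedy.
            [^\\]          # Non backslash.
            (\\{2})*       # Even N of backslashes.
        )
    )
    "                      # Closing quote.
''', re.VERBOSE)

# A value ending in a backslash, as it appears in JSON text, plus a neighbor.
text = json.dumps('tail\\') + ', ' + json.dumps('next')
print(text)                             # "tail\\", "next"
print(OLD_STRING.match(text).group())   # "tail\\", "   <- runs into the next string
print(NEW_STRING.match(text).group())   # "tail\\"      <- stops at the right quote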