Token-definition order matters. When lexing you have to take special care to attempt to match the token definitions in an order that will avoid confusion, which can occur if one definition embraces a simpler definition (for example, a quoted string can contain lots of other stuff). One strategy to avoid such problems is to attempt the "bigger" entities first. Even though I found no specific problems along these lines in your lexer, on general principle I rearranged the ordering of the token definitions.

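To make the ordering point concrete, here is a tiny hypothetical illustration (INT and FLOAT are made-up token types for this demo, not part of the JSON lexer below). With first-match-wins dispatch, putting the narrower pattern first silently truncates the token:

import re

# Two overlapping token definitions, tried in different orders.
SMALL_FIRST = [('INT', re.compile(r'\d+')), ('FLOAT', re.compile(r'\d+\.\d+'))]
BIG_FIRST   = [('FLOAT', re.compile(r'\d+\.\d+')), ('INT', re.compile(r'\d+'))]

def first_match(defs, text):
    # Return the first (type, value) pair whose pattern matches at the start of text.
    for tok_type, rgx in defs:
        m = rgx.match(text)
        if m:
            return (tok_type, m.group())

print(first_match(SMALL_FIRST, '3.14'))   # ('INT', '3')      -- truncated
print(first_match(BIG_FIRST, '3.14'))     # ('FLOAT', '3.14') -- what we want

Here is the full reworked example:
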
import re
import json
import sys
from dataclasses import dataclass

def json_simple():
    # A simple chunk of JSON with various data types.
    d = dict(
        msg = "hello world",
        n = 99.34,
        status = True,
        other = None,
    )
    return json.dumps(d, indent = 4)

def good_tokens_bad_json():
    # The lexer should accept this. Let the parser reject it.
    return ''' true null "hi" } 123 {  '''

def json_quoting_example():
    # Some strings with internal double-quotes and backslashes.
    examples = (
        # Just quotes.
        r'__"foo"__',
        r'__"foo__',
        r'__foo"__',
        # Quotes with leading backslashes.
        r'__\"foo\"__',
        r'__\"foo__',
        r'__foo\"__',
        # Quotes with 2 leading backslashes.
        r'__\\"foo\\"__',
        r'__\\"foo__',
        r'__foo\\"__',
    )

    # Convert those examples into a dict and then JSON text.
    d = {
        f'ex{i}' : ex
        for i, ex in enumerate(examples)
    }
    return json.dumps(d, indent = 4)

def invalid_text():
    return '''{"a": 123, blort: 99}'''

EXAMPLES = dict(
    quoting = json_quoting_example(),
    goodbad = good_tokens_bad_json(),
    simple = json_simple(),
    invalid = invalid_text(),
)

def main():
    args = sys.argv[1:] + ['simple']
    k = args[0]
    text = EXAMPLES[k]
    lex = Lexer(text)
    lex.lex()
    for tok in lex.tokens:
        print(tok)

EOF = 'EOF'

TOKEN_DEFINITIONS = [
    # Try to match these bigger concepts first.
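    # NOTE: the QUOTED_STRING pattern below is known to be flawed: it breaks
    # when the string itself ends with a backslash (see the postscript below).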
    ('QUOTED_STRING', r'".*?(?<!\\)"'),
    ('NUMBER', r'-?(?:0|[1-9]\d*)(?:\.\d+)?(?:e[-+]\d+)?'),
    # Everything else is atomic and simple, so no confusion to worry about.
    ('OPEN_BRACE', r'{'),
    ('CLOSE_BRACE', r'}'),
    ('OPEN_BRACKET', r'\['),
    ('CLOSE_BRACKET', r'\]'),
    ('COMMA', r','),
    ('WHITESPACE', r'\s+'),
    ('COLON', r':'),
    ('NULL', r'null'),
    ('TRUE', r'true'),
    ('FALSE', r'false'),
    (EOF, r'$')
]

TOKEN_DEFINITIONS = [
    (type, re.compile(pattern))
    for type, pattern in TOKEN_DEFINITIONS
]

@dataclass(frozen = True)
class Token:
    pos: int
    type: str
    value: str

class Lexer:

    def __init__(self, text):
        self.text = text
        self.pos = 0
        self.tokens = []

    def lex(self):
        while True:
            tok = self.get_next_token()
            self.tokens.append(tok)
            if tok.type == EOF:
                break

    def get_next_token(self):
        for tok_type, rgx in TOKEN_DEFINITIONS:
            m = rgx.match(self.text, pos = self.pos)
            if m:
                value = m.group()
                self.pos += len(value)
                return Token(self.pos, tok_type, value)

        chunk = self.text[self.pos : self.pos + 20]
        msg = f'Unrecognized token: position={self.pos} content={chunk!r}'
        raise ValueError(msg)

if __name__ == '__main__':
    main()
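
If you want to poke at the Lexer directly, without the command-line wrapper, something like this works (the input here is just a tiny example of my own):

lex = Lexer('{"a": 1}')
lex.lex()
print([(t.type, t.value) for t in lex.tokens if t.type != 'WHITESPACE'])
# [('OPEN_BRACE', '{'), ('QUOTED_STRING', '"a"'), ('COLON', ':'),
#  ('NUMBER', '1'), ('CLOSE_BRACE', '}'), ('EOF', '')]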

 

Postscript: my regular expression for quoted strings is also bad. It fails when the string actually ends with a backslash. I believe that this answer shows how to do it correctly (but I did not test it extensively, which one really must do with these sorts of things). Here are the components of the regular expression broken down and applied to your situation (double-quoted strings only), followed by a quick sanity check:

"                      # Opening quote.
(
    (\\{2})*           # Just an even N of backslashes.
    |                  # OR ...
    (
        .*?            # Stuff, non-greedy.
        [^\\]          # Non backslash.
        (\\{2})*       # Even N of backslashes.
    )
)
"                      # Closing quote.
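
As a quick, lightly tested sanity check (my own snippet; the OLD_STRING/NEW_STRING names are just for the demo), here is the current pattern next to one assembled from that breakdown, applied to JSON text whose string value ends with a backslash:

import re
import json

# The pattern currently used by the lexer.
OLD_STRING = re.compile(r'".*?(?<!\\)"')

# The breakdown above, kept verbatim thanks to re.VERBOSE.
NEW_STRING = re.compile(r'''
    "                      # Opening quote.
    (
        (\\{2})*           # Just an even N of backslashes.
        |                  # OR ...
        (
            .*?            # Stuff, non-greedy.
            [^\\]          # Non backslash.
            (\\{2})*       # Even N of backslashes.
        )
    )
    "                      # Closing quote.
''', re.VERBOSE)

# A value ending in a backslash, as it appears in JSON text, plus a neighbor.
text = json.dumps('tail\\') + ', ' + json.dumps('next')
print(text)                             # "tail\\", "next"
print(OLD_STRING.match(text).group())   # "tail\\", "   <- runs into the next string
print(NEW_STRING.match(text).group())   # "tail\\"      <- stops at the right quote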