# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at:
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the
# License.
import base64
from decimal import Decimal
from collections import defaultdict
from enum import IntEnum
from functools import partial
from typing import Optional, NamedTuple
from amazon.ion.core import Transition, ION_STREAM_INCOMPLETE_EVENT, ION_STREAM_END_EVENT, IonType, IonEvent, \
IonEventType, IonThunkEvent, TimestampPrecision, timestamp, ION_VERSION_MARKER_EVENT
from amazon.ion.exceptions import IonException
from amazon.ion.reader import BufferQueue, reader_trampoline, ReadEventType, CodePointArray, CodePoint
from amazon.ion.symbols import SymbolToken, TEXT_ION_1_0
from amazon.ion.util import coroutine, _next_code_point, CodePoint
def _illegal_character(c, ctx, message=''):
"""Raises an IonException upon encountering the given illegal character in the given context.
Args:
c (int|None): Ordinal of the illegal character.
ctx (_HandlerContext): Context in which the illegal character was encountered.
message (Optional[str]): Additional information, as necessary.
"""
    container_type = 'top-level' if ctx.container.ion_type is None else ctx.container.ion_type.name
    value_type = 'unknown' if ctx.ion_type is None else ctx.ion_type.name
if c is None:
header = 'Illegal token'
else:
c = 'EOF' if BufferQueue.is_eof(c) else chr(c)
header = 'Illegal character %s' % (c,)
raise IonException('%s at position %d in %s value contained in %s. %s Pending value: %s'
% (header, ctx.queue.position, value_type, container_type, message, ctx.value))
def _defaultdict(dct, fallback=_illegal_character):
"""Wraps the given dictionary such that the given fallback function will be called when a nonexistent key is
accessed.
"""
out = defaultdict(lambda: fallback)
for k, v in iter(dct.items()):
out[k] = v
return out
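# Note: a missing key returns the fallback function itself (not its result); the lexer then invokes
# whatever it looked up as a handler, so by default an unexpected character raises via _illegal_character.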
def _merge_mappings(*args):
"""Merges a sequence of dictionaries and/or tuples into a single dictionary.
If a given argument is a tuple, it must have two elements, the first of which is a sequence of keys and the second
of which is a single value, which will be mapped to from each of the keys in the sequence.
"""
dct = {}
for arg in args:
if isinstance(arg, dict):
merge = arg
else:
assert isinstance(arg, tuple)
keys, value = arg
merge = dict(zip(keys, [value]*len(keys)))
dct.update(merge)
return dct
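# For example, _merge_mappings({1: 'a'}, ((2, 3), 'b')) == {1: 'a', 2: 'b', 3: 'b'}.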
def _seq(s):
"""Converts bytes to a sequence of integer code points."""
return tuple(iter(s))
_ENCODING = 'utf-8'
# NOTE: the following are stored as sequences of integer code points. This simplifies dealing with inconsistencies
# between how bytes objects are handled in python 2 and 3, and simplifies logic around comparing multi-byte characters.
_WHITESPACE_NOT_NL = _seq(b' \t\v\f')
_WHITESPACE = _WHITESPACE_NOT_NL + _seq(b'\n\r')
_VALUE_TERMINATORS = _seq(b'{}[](),\"\' \t\n\r/')
_SYMBOL_TOKEN_TERMINATORS = _WHITESPACE + _seq(b'/:')
_DIGITS = _seq(b'0123456789')
_BINARY_RADIX = _seq(b'Bb')
_BINARY_DIGITS = _seq(b'01')
_HEX_RADIX = _seq(b'Xx')
_HEX_DIGITS = _DIGITS + _seq(b'abcdefABCDEF')
_DECIMAL_EXPS = _seq(b'Dd')
_FLOAT_EXPS = _seq(b'Ee')
_SIGN = _seq(b'+-')
_TIMESTAMP_YEAR_DELIMITERS = _seq(b'-T')
_TIMESTAMP_DELIMITERS = _seq(b'-:+.')
_TIMESTAMP_OFFSET_INDICATORS = _seq(b'Z+-')
_LETTERS = _seq(b'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
_BASE64_DIGITS = _LETTERS + _DIGITS + _seq(b'+/')
_IDENTIFIER_STARTS = _LETTERS + _seq(b'_') # Note: '$' is dealt with separately.
_IDENTIFIER_CHARACTERS = _IDENTIFIER_STARTS + _DIGITS + _seq(b'$')
_OPERATORS = _seq(b'!#%&*+-./;<=>?@^`|~')
_COMMON_ESCAPES = _seq(b'abtnfrv?0\'"/\\')
_NEWLINES = _seq(b'\r\n')
_UNDERSCORE = ord(b'_')
_DOT = ord(b'.')
_COMMA = ord(b',')
_COLON = ord(b':')
_SLASH = ord(b'/')
_ASTERISK = ord(b'*')
_BACKSLASH = ord(b'\\')
_CARRIAGE_RETURN = ord(b'\r')
_NEWLINE = ord(b'\n')
_DOUBLE_QUOTE = ord(b'"')
_SINGLE_QUOTE = ord(b'\'')
_DOLLAR_SIGN = ord(b'$')
_PLUS = ord(b'+')
_MINUS = ord(b'-')
_HYPHEN = _MINUS
_T = ord(b'T')
_Z = ord(b'Z')
_T_LOWER = ord(b't')
_N_LOWER = ord(b'n')
_F_LOWER = ord(b'f')
_ZERO = _DIGITS[0]
_OPEN_BRACE = ord(b'{')
_OPEN_BRACKET = ord(b'[')
_OPEN_PAREN = ord(b'(')
_CLOSE_BRACE = ord(b'}')
_CLOSE_BRACKET = ord(b']')
_CLOSE_PAREN = ord(b')')
_BASE64_PAD = ord(b'=')
_QUESTION_MARK = ord(b'?')
_UNICODE_ESCAPE_2 = ord(b'x')
_UNICODE_ESCAPE_4 = ord(b'u')
_UNICODE_ESCAPE_8 = ord(b'U')
_ESCAPED_NEWLINE = u'' # An escaped newline expands to nothing.
_MAX_TEXT_CHAR = 0x10ffff
_MAX_CLOB_CHAR = 0x7f
_MIN_QUOTED_CHAR = 0x20
# The following suffixes are used for comparison when a token is found that starts with the first letter in
# the keyword. For example, when a new token starts with 't', the next three characters must match those in
# _TRUE_SUFFIX, followed by an acceptable termination character, in order for the token to match the 'true' keyword.
_TRUE_SUFFIX = _seq(b'rue')
_FALSE_SUFFIX = _seq(b'alse')
_NAN_SUFFIX = _seq(b'an')
_INF_SUFFIX = _seq(b'inf')
_IVM_PREFIX = _seq(b'$ion_')
_IVM_EVENTS = {
TEXT_ION_1_0: ION_VERSION_MARKER_EVENT,
}
_POS_INF = float('+inf')
_NEG_INF = float('-inf')
_NAN = float('nan')
def _ends_value(c):
return c in _VALUE_TERMINATORS or BufferQueue.is_eof(c)
class _NullSequence:
"""Contains the terminal character sequence for the typed null suffix of the given IonType, starting with the first
character after the one which disambiguated the type.
For example, SYMBOL's _NullSequence contains the characters 'mbol' because 'null.s' is ambiguous until 'y' is found,
at which point it must end in 'mbol'.
Instances are used as leaves of the typed null prefix tree below.
"""
def __init__(self, ion_type, sequence):
self.ion_type = ion_type
self.sequence = sequence
def __getitem__(self, item):
return self.sequence[item]
_NULL_SUFFIX = _NullSequence(IonType.NULL, _seq(b'ull'))
_NULL_SYMBOL_SUFFIX = _NullSequence(IonType.SYMBOL, _seq(b'mbol'))
_NULL_SEXP_SUFFIX = _NullSequence(IonType.SEXP, _seq(b'xp'))
_NULL_STRING_SUFFIX = _NullSequence(IonType.STRING, _seq(b'ng'))
_NULL_STRUCT_SUFFIX = _NullSequence(IonType.STRUCT, _seq(b'ct'))
_NULL_INT_SUFFIX = _NullSequence(IonType.INT, _seq(b'nt'))
_NULL_FLOAT_SUFFIX = _NullSequence(IonType.FLOAT, _seq(b'loat'))
_NULL_DECIMAL_SUFFIX = _NullSequence(IonType.DECIMAL, _seq(b'ecimal'))
_NULL_CLOB_SUFFIX = _NullSequence(IonType.CLOB, _seq(b'lob'))
_NULL_LIST_SUFFIX = _NullSequence(IonType.LIST, _seq(b'ist'))
_NULL_BLOB_SUFFIX = _NullSequence(IonType.BLOB, _seq(b'ob'))
_NULL_BOOL_SUFFIX = _NullSequence(IonType.BOOL, _seq(b'ol'))
_NULL_TIMESTAMP_SUFFIX = _NullSequence(IonType.TIMESTAMP, _seq(b'imestamp'))
# The following implements a prefix tree used to determine whether a typed null keyword has been found (see
# _typed_null_handler). The leaves of the tree (enumerated above) are the terminal character sequences for the 13
# possible suffixes to 'null.'. Any other suffix to 'null.' is an error. _NULL_STARTS is entered when 'null.' is found.
_NULL_STR_NEXT = {
ord(b'i'): _NULL_STRING_SUFFIX,
ord(b'u'): _NULL_STRUCT_SUFFIX
}
_NULL_ST_NEXT = {
ord(b'r'): _NULL_STR_NEXT
}
_NULL_S_NEXT = {
ord(b'y'): _NULL_SYMBOL_SUFFIX,
ord(b'e'): _NULL_SEXP_SUFFIX,
ord(b't'): _NULL_ST_NEXT
}
_NULL_B_NEXT = {
ord(b'l'): _NULL_BLOB_SUFFIX,
ord(b'o'): _NULL_BOOL_SUFFIX
}
_NULL_STARTS = {
ord(b'n'): _NULL_SUFFIX, # null.null
ord(b's'): _NULL_S_NEXT, # null.string, null.symbol, null.struct, null.sexp
ord(b'i'): _NULL_INT_SUFFIX, # null.int
ord(b'f'): _NULL_FLOAT_SUFFIX, # null.float
ord(b'd'): _NULL_DECIMAL_SUFFIX, # null.decimal
ord(b'b'): _NULL_B_NEXT, # null.bool, null.blob
ord(b'c'): _NULL_CLOB_SUFFIX, # null.clob
ord(b'l'): _NULL_LIST_SUFFIX, # null.list
ord(b't'): _NULL_TIMESTAMP_SUFFIX, # null.timestamp
}
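# Example walk: after 'null.', the input 'struct' resolves as 's' -> _NULL_S_NEXT, 't' -> _NULL_ST_NEXT,
# 'r' -> _NULL_STR_NEXT, 'u' -> _NULL_STRUCT_SUFFIX, whose remaining sequence is 'ct'.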
class _ContainerContext(NamedTuple):
"""A description of an Ion container, including the container's IonType and its textual delimiter and end character,
if applicable.
This is tracked as part of the current token's context, and is useful when certain lexing decisions depend on
which container the token is a member of. For example, ending a numeric token with ']' is not legal unless that
token is contained in a list.
Args:
end (tuple): Tuple containing the container's end character, if any.
delimiter (tuple): Tuple containing the container's delimiter character, if any.
ion_type (Optional[IonType]): The container's IonType, if any.
is_delimited (bool): True if delimiter is not empty; otherwise, False.
"""
end: tuple
delimiter: tuple
ion_type: Optional[IonType]
is_delimited: bool
_C_TOP_LEVEL = _ContainerContext((), (), None, False)
_C_STRUCT = _ContainerContext((_CLOSE_BRACE,), (_COMMA,), IonType.STRUCT, True)
_C_LIST = _ContainerContext((_CLOSE_BRACKET,), (_COMMA,), IonType.LIST, True)
_C_SEXP = _ContainerContext((_CLOSE_PAREN,), (), IonType.SEXP, False)
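# S-expressions have an end character (')') but no element delimiter, hence is_delimited=False.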
def _is_escaped(c):
"""Queries whether a character ordinal or code point was part of an escape sequence."""
try:
return c.is_escaped
except AttributeError:
return False
def _as_symbol(value, is_symbol_value=True):
"""Converts the input to a :class:`SymbolToken` suitable for being emitted as part of a :class:`IonEvent`.
If the input has an `as_symbol` method (e.g. :class:`CodePointArray`), it will be converted using that method.
Otherwise, it must already be a `SymbolToken`. In this case, there is nothing to do unless the input token is not a
symbol value and it is an :class:`_IVMToken`. This requires the `_IVMToken` to be converted to a regular
`SymbolToken`.
"""
try:
return value.as_symbol()
except AttributeError:
assert isinstance(value, SymbolToken)
if not is_symbol_value:
try:
# This converts _IVMTokens to regular SymbolTokens when the _IVMToken cannot represent an IVM (i.e.
# it is a field name or annotation).
return value.regular_token()
except AttributeError:
pass
return value
class _HandlerContext():
"""A context for a handler co-routine.
Args:
container (_ContainerContext): The description of the container in which this context is contained.
queue (BufferQueue): The data source for the handler.
field_name (Optional[SymbolToken]): The token representing the field name for the handled
value.
annotations (Optional[Sequence[SymbolToken]]): The sequence of annotations tokens
for the value to be parsed.
depth (int): the depth of the parser.
whence (Coroutine): The reference to the co-routine that this handler should delegate
back to when the handler is logically done.
value (Optional[bytearray|CodePointArray]): The (in-progress) value of this context's token.
ion_type (Optional[IonType]): The IonType of the current token.
pending_symbol (Optional[bytearray|CodePointArray]): A pending symbol, which may end up being an annotation,
field name, or symbol value.
quoted_text (Optional[bool]): True if this context represents quoted text; otherwise, False.
line_comment (Optional[bool]): True if this context represents a line comment; otherwise, False.
code_point (Optional[int|CodePoint]): The token's current unicode code point, if applicable.
is_self_delimiting (Optional[bool]): True if this context's token is self-delimiting (a short string, container,
or comment).
is_composite (Optional[bool]): True if this context's token is a value immediately followed by another token
discovered during lookahead.
"""
def __init__(self, container, queue, field_name, annotations, depth, whence, value, ion_type, pending_symbol,
quoted_text=False, line_comment=False, code_point=None, is_self_delimiting=False,
is_composite=False):
self.container = container
self.queue = queue
self.field_name = field_name
self.annotations = annotations
self.depth = depth
self.whence = whence
self.value = value
self.ion_type = ion_type
self.pending_symbol = pending_symbol
self.quoted_text = quoted_text
self.line_comment = line_comment
self.code_point = code_point
self.is_self_delimiting = is_self_delimiting
self.is_composite = is_composite
def event_transition(self, event_cls, event_type, ion_type, value):
"""Returns an ion event event_transition that yields to another co-routine."""
annotations = self.annotations or ()
depth = self.depth
whence = self.whence
if ion_type is IonType.SYMBOL:
if not annotations and depth == 0 and isinstance(value, _IVMToken):
event = value.ivm_event()
if event is None:
_illegal_character(None, self, 'Illegal IVM: %s.' % (value.text,))
return Transition(event, whence)
assert not isinstance(value, _IVMToken)
return Transition(
event_cls(event_type, ion_type, value, self.field_name, annotations, depth),
whence
)
def immediate_transition(self, delegate):
"""Returns an immediate transition to another co-routine."""
return Transition(None, delegate)
def read_data_event(self, whence, complete=False, can_flush=False):
"""Creates a transition to a co-routine for retrieving data as bytes.
Args:
whence (Coroutine): The co-routine to return to after the data is satisfied.
complete (Optional[bool]): True if STREAM_END should be emitted if no bytes are read or
available; False if INCOMPLETE should be emitted in that case.
can_flush (Optional[bool]): True if NEXT may be requested after INCOMPLETE is emitted as a result of this
data request.
"""
return Transition(None, _read_data_handler(whence, self, complete, can_flush))
def next_code_point(self, whence):
"""Creates a co-routine for retrieving data as code points.
This should be used in quoted string contexts.
"""
return Transition(None, _next_code_point_handler(whence, self))
def set_unicode(self, quoted_text=False):
"""Converts the context's ``value`` to a sequence of unicode code points for holding text tokens, indicating
whether the text is quoted.
"""
if isinstance(self.value, CodePointArray):
assert self.quoted_text == quoted_text
return self
self.value = CodePointArray(self.value)
self.quoted_text = quoted_text
self.line_comment = False
return self
def set_quoted_text(self, quoted_text):
"""Sets the context's ``quoted_text`` flag. Useful when entering and exiting quoted text tokens."""
self.quoted_text = quoted_text
self.line_comment = False
return self
def set_self_delimiting(self, is_self_delimiting):
"""Sets the context's ``is_self_delimiting`` flag. Useful when the end of a self-delimiting token (short string,
container, or comment) is reached.
        This is distinct from the ``quoted_text`` flag because some quoted text (quoted symbols and long strings) is
        not self-delimiting; such tokens require lookahead to determine whether they are complete.
"""
self.is_self_delimiting = is_self_delimiting
return self
def set_code_point(self, code_point):
"""Sets the context's current ``code_point`` to the given ``int`` or :class:`CodePoint`."""
self.code_point = code_point
return self
def derive_container_context(self, ion_type, whence):
"""Derives a container context as a child of the current context."""
if ion_type is IonType.STRUCT:
container = _C_STRUCT
elif ion_type is IonType.LIST:
container = _C_LIST
elif ion_type is IonType.SEXP:
container = _C_SEXP
else:
raise TypeError('Cannot derive container context for non-container type %s.' % (ion_type.name,))
return _HandlerContext(
container=container,
queue=self.queue,
field_name=self.field_name,
annotations=self.annotations,
depth=self.depth + 1,
whence=whence,
value=None, # containers don't have a value
ion_type=ion_type,
pending_symbol=None
)
def set_empty_symbol(self):
"""Resets the context, retaining the fields that make it a child of its container (``container``, ``queue``,
``depth``, ``whence``), and sets an empty ``pending_symbol``.
This is useful when an empty quoted symbol immediately follows a long string.
"""
self.field_name = None
self.annotations = None
self.ion_type = None
self.set_pending_symbol(CodePointArray())
return self
def derive_child_context(self, whence):
"""Derives a scalar context as a child of the current context."""
return _HandlerContext(
container=self.container,
queue=self.queue,
field_name=None,
annotations=None,
depth=self.depth,
whence=whence,
value=bytearray(), # children start without a value
ion_type=None,
pending_symbol=None
)
def set_line_comment(self, is_line_comment=True):
"""Sets the context's ``line_comment`` flag. Useful when entering or exiting a line comment."""
self.line_comment = is_line_comment
return self
def set_ion_type(self, ion_type):
"""Sets context to the given IonType."""
if ion_type is self.ion_type:
return self
self.ion_type = ion_type
self.line_comment = False
return self
def set_annotation(self):
"""Appends the context's ``pending_symbol`` to its ``annotations`` sequence."""
assert self.pending_symbol is not None
assert not self.value
annotations = (_as_symbol(self.pending_symbol, is_symbol_value=False),) # pending_symbol becomes an annotation
self.annotations = annotations if not self.annotations else self.annotations + annotations
self.ion_type = None
self.pending_symbol = None # reset pending symbol
self.quoted_text = False
self.line_comment = False
self.is_self_delimiting = False
return self
def set_field_name(self):
"""Sets the context's ``pending_symbol`` as its ``field_name``."""
assert self.pending_symbol is not None
assert not self.value
self.field_name = _as_symbol(self.pending_symbol, is_symbol_value=False) # pending_symbol becomes field name
self.pending_symbol = None # reset pending symbol
self.quoted_text = False
self.line_comment = False
self.is_self_delimiting = False
return self
def set_pending_symbol(self, pending_symbol=None):
"""Sets the context's ``pending_symbol`` with the given unicode sequence and resets the context's ``value``.
If the input is None, an empty :class:`CodePointArray` is used.
"""
if pending_symbol is None:
pending_symbol = CodePointArray()
self.value = bytearray() # reset value
self.pending_symbol = pending_symbol
self.line_comment = False
return self
def set_composite(self, is_composite):
self.is_composite = is_composite
return self
class _CompositeTransition(Transition):
"""Composes an event transition followed by an immediate transition to the handler for the next token.
This is useful when some lookahead is required to determine if a token has ended, e.g. in the case of long strings.
Args:
event_transition (Transition): A transition with a non-None IonEvent.
current_context (_HandlerContext): The context for the value contained in ``event_transition``.
next_handler (Coroutine): The handler that will lex the next token. Only None if ``next_context`` contains a
complete token (as is the case with an empty quoted symbol following a long string).
next_context (Optional[_HandlerContext]): The context for the next token. If None, a new child context
            will be derived from ``current_context``.
initialize_handler (Optional[bool]): True if the ``next_handler`` coroutine needs to be initialized;
otherwise, False.
"""
def __new__(cls, event_transition, *args, **kwargs):
return Transition.__new__(cls, event_transition.event, event_transition.delegate)
def __init__(self, event_transition, current_context, next_handler, next_context=None, initialize_handler=True):
assert event_transition.event is not None
if next_context is None:
next_context = current_context.derive_child_context(current_context.whence)
next_transition = None
if next_handler is not None:
if initialize_handler:
next_handler = next_handler(next_context)
next_transition = next_context.immediate_transition(next_handler)
current_context.set_composite(True)
self.next_transition = next_transition
self.next_context = next_context
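# For example, once lookahead past a long string's closing quotes finds the start of the next value,
# a _CompositeTransition emits the string's event and immediately transitions to the next token's handler.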
def _decode(value):
return value.decode(_ENCODING)
def _parse_number(parse_func, value, base=10):
def parse():
return parse_func(value, base)
return parse
def _base_10(parse_func, value, base, decode=False):
assert base == 10
if decode:
value = _decode(value)
return parse_func(value)
def _base_n(parse_func, value, base):
return parse_func(_decode(value), base)
# Note: Python 3 ints are arbitrary precision, so arbitrarily large integer tokens parse without overflow.
_parse_decimal_int = partial(_parse_number, partial(_base_10, int))
_parse_binary_int = partial(_parse_number, partial(_base_n, int), base=2)
_parse_hex_int = partial(_parse_number, partial(_base_n, int), base=16)
_parse_float = partial(_parse_number, partial(_base_10, float))
_parse_decimal = partial(_parse_number, partial(_base_10, Decimal, decode=True))
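# Each parser above maps a token value to a zero-argument thunk, e.g. _parse_decimal_int(bytearray(b'42'))() == 42;
# the thunk defers parsing until the resulting IonThunkEvent's value is needed.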
@coroutine
def _number_negative_start_handler(c, ctx):
"""Handles numeric values that start with a negative sign. Branches to delegate co-routines according to
_NEGATIVE_TABLE.
"""
assert c == _MINUS
assert len(ctx.value) == 0
ctx.set_ion_type(IonType.INT)
ctx.value.append(c)
c, _ = yield
yield ctx.immediate_transition(_NEGATIVE_TABLE[c](c, ctx))
@coroutine
def _number_zero_start_handler(c, ctx):
"""Handles numeric values that start with zero or negative zero. Branches to delegate co-routines according to
_ZERO_START_TABLE.
"""
assert c == _ZERO
assert len(ctx.value) == 0 or (len(ctx.value) == 1 and ctx.value[0] == _MINUS)
ctx.set_ion_type(IonType.INT)
ctx.value.append(c)
c, _ = yield
if _ends_value(c):
trans = ctx.event_transition(IonThunkEvent, IonEventType.SCALAR, ctx.ion_type, _parse_decimal_int(ctx.value))
if c == _SLASH:
trans = ctx.immediate_transition(_number_slash_end_handler(c, ctx, trans))
yield trans
yield ctx.immediate_transition(_ZERO_START_TABLE[c](c, ctx))
@coroutine
def _number_or_timestamp_handler(c, ctx):
"""Handles numeric values that start with digits 1-9. May terminate a value, in which case that value is an
int. If it does not terminate a value, it branches to delegate co-routines according to _NUMBER_OR_TIMESTAMP_TABLE.
"""
assert c in _DIGITS
ctx.set_ion_type(IonType.INT) # If this is the last digit read, this value is an Int.
val = ctx.value
val.append(c)
c, self = yield
trans = ctx.immediate_transition(self)
while True:
if _ends_value(c):
trans = ctx.event_transition(IonThunkEvent, IonEventType.SCALAR,
ctx.ion_type, _parse_decimal_int(ctx.value))
if c == _SLASH:
trans = ctx.immediate_transition(_number_slash_end_handler(c, ctx, trans))
else:
if c not in _DIGITS:
trans = ctx.immediate_transition(_NUMBER_OR_TIMESTAMP_TABLE[c](c, ctx))
else:
val.append(c)
c, _ = yield trans
@coroutine
def _number_slash_end_handler(c, ctx, event):
"""Handles numeric values that end in a forward slash. This is only legal if the slash begins a comment; thus,
this co-routine either results in an error being raised or an event being yielded.
"""
assert c == _SLASH
c, self = yield
next_ctx = ctx.derive_child_context(ctx.whence)
comment = _comment_handler(_SLASH, next_ctx, next_ctx.whence)
comment.send((c, comment))
# If the previous line returns without error, it's a valid comment and the number may be emitted.
yield _CompositeTransition(event, ctx, comment, next_ctx, initialize_handler=False)
def _numeric_handler_factory(charset, transition, assertion, illegal_before_underscore, parse_func,
illegal_at_end=(None,), ion_type=None, append_first_if_not=None, first_char=None):
"""Generates a handler co-routine which tokenizes a numeric component (a token or sub-token).
Args:
charset (sequence): Set of ordinals of legal characters for this numeric component.
transition (callable): Called upon termination of this component (i.e. when a character not in ``charset`` is
found). Accepts the previous character ordinal, the current character ordinal, the current context, and the
previous transition. Returns a Transition if the component ends legally; otherwise, raises an error.
assertion (callable): Accepts the first character's ordinal and the current context. Returns True if this is
a legal start to the component.
illegal_before_underscore (sequence): Set of ordinals of illegal characters to precede an underscore for this
component.
parse_func (callable): Called upon ending the numeric value. Accepts the current token value and returns a
thunk that lazily parses the token.
illegal_at_end (Optional[sequence]): Set of ordinals of characters that may not legally end the value.
ion_type (Optional[IonType]): The type of the value if it were to end on this component.
append_first_if_not (Optional[int]): The ordinal of a character that should not be appended to the token if
it occurs first in this component (e.g. an underscore in many cases).
first_char (Optional[int]): The ordinal of the character that should be appended instead of the character that
occurs first in this component. This is useful for preparing the token for parsing in the case where a
particular character is peculiar to the Ion format (e.g. 'd' to denote the exponent of a decimal value
should be replaced with 'e' for compatibility with python's Decimal type).
"""
@coroutine
def numeric_handler(c, ctx):
assert assertion(c, ctx)
if ion_type is not None:
ctx.set_ion_type(ion_type)
val = ctx.value
if c != append_first_if_not:
first = c if first_char is None else first_char
val.append(first)
prev = c
c, self = yield
trans = ctx.immediate_transition(self)
while True:
if _ends_value(c):
if prev == _UNDERSCORE or prev in illegal_at_end:
_illegal_character(c, ctx, '%s at end of number.' % (chr(prev),))
trans = ctx.event_transition(IonThunkEvent, IonEventType.SCALAR, ctx.ion_type, parse_func(ctx.value))
if c == _SLASH:
trans = ctx.immediate_transition(_number_slash_end_handler(c, ctx, trans))
else:
if c == _UNDERSCORE:
if prev == _UNDERSCORE or prev in illegal_before_underscore:
_illegal_character(c, ctx, 'Underscore after %s.' % (chr(prev),))
else:
if c not in charset:
trans = transition(prev, c, ctx, trans)
else:
val.append(c)
prev = c
c, _ = yield trans
return numeric_handler
def _exponent_handler_factory(ion_type, exp_chars, parse_func, first_char=None):
"""Generates a handler co-routine which tokenizes an numeric exponent.
Args:
ion_type (IonType): The type of the value with this exponent.
exp_chars (sequence): The set of ordinals of the legal exponent characters for this component.
parse_func (callable): Called upon ending the numeric value. Accepts the current token value and returns a
thunk that lazily parses the token.
first_char (Optional[int]): The ordinal of the character that should be appended instead of the character that
occurs first in this component. This is useful for preparing the token for parsing in the case where a
particular character is peculiar to the Ion format (e.g. 'd' to denote the exponent of a decimal value
should be replaced with 'e' for compatibility with python's Decimal type).
"""
def transition(prev, c, ctx, trans):
if c in _SIGN and prev in exp_chars:
ctx.value.append(c)
else:
_illegal_character(c, ctx)
return trans
illegal = exp_chars + _SIGN
return _numeric_handler_factory(_DIGITS, transition, lambda c, ctx: c in exp_chars, illegal, parse_func,
illegal_at_end=illegal, ion_type=ion_type, first_char=first_char)
_decimal_handler = _exponent_handler_factory(IonType.DECIMAL, _DECIMAL_EXPS, _parse_decimal, first_char=ord(b'e'))
_float_handler = _exponent_handler_factory(IonType.FLOAT, _FLOAT_EXPS, _parse_float)
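# For decimals, the 'd'/'D' exponent marker is stored as 'e' (via first_char) so the token can be fed
# directly to Decimal; float exponents ('e'/'E') are kept as-is.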
def _coefficient_handler_factory(trans_table, parse_func, assertion=lambda c, ctx: True,
ion_type=None, append_first_if_not=None):
"""Generates a handler co-routine which tokenizes a numeric coefficient.
Args:
trans_table (dict): lookup table for the handler for the next component of this numeric token, given the
ordinal of the first character in that component.
parse_func (callable): Called upon ending the numeric value. Accepts the current token value and returns a
thunk that lazily parses the token.
assertion (callable): Accepts the first character's ordinal and the current context. Returns True if this is
a legal start to the component.
ion_type (Optional[IonType]): The type of the value if it were to end on this coefficient.
append_first_if_not (Optional[int]): The ordinal of a character that should not be appended to the token if
it occurs first in this component (e.g. an underscore in many cases).
"""
def transition(prev, c, ctx, trans):
if prev == _UNDERSCORE:
_illegal_character(c, ctx, 'Underscore before %s.' % (chr(c),))
return ctx.immediate_transition(trans_table[c](c, ctx))
return _numeric_handler_factory(_DIGITS, transition, assertion, (_DOT,), parse_func,
ion_type=ion_type, append_first_if_not=append_first_if_not)
_FRACTIONAL_NUMBER_TABLE = _defaultdict(
_merge_mappings(
(_DECIMAL_EXPS, _decimal_handler),
(_FLOAT_EXPS, _float_handler)
)
)
fractional_number_handler = _coefficient_handler_factory(
_FRACTIONAL_NUMBER_TABLE, _parse_decimal, assertion=lambda c, ctx: c == _DOT, ion_type=IonType.DECIMAL)
_WHOLE_NUMBER_TABLE = _defaultdict(
_merge_mappings(
{
_DOT: fractional_number_handler,
},
_FRACTIONAL_NUMBER_TABLE
)
)
_whole_number_handler = _coefficient_handler_factory(_WHOLE_NUMBER_TABLE, _parse_decimal_int,
append_first_if_not=_UNDERSCORE)
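# Dispatch summary: a whole-number token becomes a DECIMAL on '.' or 'd'/'D', a FLOAT on 'e'/'E',
# and is emitted as an INT if it terminates without either.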
def _radix_int_handler_factory(radix_indicators, charset, parse_func):
"""Generates a handler co-routine which tokenizes a integer of a particular radix.
Args:
radix_indicators (sequence): The set of ordinals of characters that indicate the radix of this int.
charset (sequence): Set of ordinals of legal characters for this radix.
parse_func (callable): Called upon ending the numeric value. Accepts the current token value and returns a
thunk that lazily parses the token.
"""
def assertion(c, ctx):
return c in radix_indicators and \
((len(ctx.value) == 1 and ctx.value[0] == _ZERO) or
(len(ctx.value) == 2 and ctx.value[0] == _MINUS and ctx.value[1] == _ZERO)) and \
ctx.ion_type == IonType.INT
return _numeric_handler_factory(charset, lambda prev, c, ctx, trans: _illegal_character(c, ctx),
assertion, radix_indicators, parse_func, illegal_at_end=radix_indicators)
_binary_int_handler = _radix_int_handler_factory(_BINARY_RADIX, _BINARY_DIGITS, _parse_binary_int)
_hex_int_handler = _radix_int_handler_factory(_HEX_RADIX, _HEX_DIGITS, _parse_hex_int)
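# Note: the radix prefix is retained in the token (e.g. b'0x1f'), which int() accepts when given the
# matching base: int('0x1f', 16) == 31.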
@coroutine
def _timestamp_zero_start_handler(c, ctx):
"""Handles numeric values that start with a zero followed by another digit. This is either a timestamp or an
error.
"""
val = ctx.value
ctx.set_ion_type(IonType.TIMESTAMP)
if val[0] == _MINUS:
_illegal_character(c, ctx, 'Negative year not allowed.')
val.append(c)
c, self = yield
trans = ctx.immediate_transition(self)
while True:
if c in _TIMESTAMP_YEAR_DELIMITERS:
trans = ctx.immediate_transition(_timestamp_handler(c, ctx))
elif c in _DIGITS:
val.append(c)
else:
_illegal_character(c, ctx)
c, _ = yield trans
class _TimestampState(IntEnum):
YEAR = 0
MONTH = 1
DAY = 2
HOUR = 3
MINUTE = 4
SECOND = 5
FRACTIONAL = 6
OFF_HOUR = 7
OFF_MINUTE = 8
class _TimestampTokens:
"""Holds the individual numeric tokens (as strings) that compose a `Timestamp`."""
def __init__(self, year=None):
fld = []
for i in iter(_TimestampState):
fld.append(None)
if year is not None:
fld[_TimestampState.YEAR] = year
self._fields = fld
def transition(self, state):
val = bytearray()
self._fields[state] = val
return val
def __getitem__(self, item):
return self._fields[item]
_ZEROS = [
b'',
b'0',
b'00',
b'000',
b'0000',
b'00000'
]
def _parse_timestamp(tokens):
"""Parses each token in the given `_TimestampTokens` and marshals the numeric components into a `Timestamp`."""
def parse():
precision = TimestampPrecision.YEAR
off_hour = tokens[_TimestampState.OFF_HOUR]
off_minutes = tokens[_TimestampState.OFF_MINUTE]
fraction = None
if off_hour is not None:
assert off_minutes is not None
off_sign = -1 if _MINUS in off_hour else 1
off_hour = int(off_hour)
off_minutes = int(off_minutes) * off_sign
if off_sign == -1 and off_hour == 0 and off_minutes == 0:
# -00:00 (unknown UTC offset) is a naive datetime.
off_hour = None
off_minutes = None
else:
assert off_minutes is None
year = tokens[_TimestampState.YEAR]
assert year is not None
year = int(year)
month = tokens[_TimestampState.MONTH]
if month is None:
month = 1
else:
month = int(month)
precision = TimestampPrecision.MONTH
day = tokens[_TimestampState.DAY]
if day is None:
day = 1
else:
day = int(day)
precision = TimestampPrecision.DAY
hour = tokens[_TimestampState.HOUR]
minute = tokens[_TimestampState.MINUTE]
if hour is None:
assert minute is None
hour = 0
minute = 0
else:
assert minute is not None
hour = int(hour)
minute = int(minute)
precision = TimestampPrecision.MINUTE
second = tokens[_TimestampState.SECOND]
if second is None:
second = 0
else:
second = int(second)
precision = TimestampPrecision.SECOND
fraction = tokens[_TimestampState.FRACTIONAL]
if fraction is not None:
fraction = Decimal(int(fraction)).scaleb(-1 * len(fraction))
return timestamp(
year, month, day,
hour, minute, second, None,
off_hour, off_minutes,
precision=precision, fractional_precision=None, fractional_seconds=fraction
)
return parse
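# For example, '2007-02-23T20:14:33.079Z' yields SECOND precision with fractional_seconds Decimal('0.079')
# and a UTC offset, while an offset written as '-05:30' yields off_hour == -5 and off_minutes == -30.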
@coroutine
def _timestamp_handler(c, ctx):
"""Handles timestamp values. Entered after the year component has been completed; tokenizes the remaining
components.
"""
assert c in _TIMESTAMP_YEAR_DELIMITERS
ctx.set_ion_type(IonType.TIMESTAMP)
if len(ctx.value) != 4:
_illegal_character(c, ctx, 'Timestamp year is %d digits; expected 4.' % (len(ctx.value),))
prev = c
c, self = yield
trans = ctx.immediate_transition(self)
state = _TimestampState.YEAR
nxt = _DIGITS
tokens = _TimestampTokens(ctx.value)
val = None
can_terminate = False
if prev == _T:
nxt += _VALUE_TERMINATORS
can_terminate = True
while True:
is_eof = can_terminate and BufferQueue.is_eof(c)
if c not in nxt and not is_eof:
_illegal_character(c, ctx, 'Expected %r in state %r.' % ([chr(x) for x in nxt], state))
if c in _VALUE_TERMINATORS or is_eof:
if not can_terminate:
_illegal_character(c, ctx, 'Unexpected termination of timestamp.')
trans = ctx.event_transition(IonThunkEvent, IonEventType.SCALAR, ctx.ion_type, _parse_timestamp(tokens))
if c == _SLASH:
trans = ctx.immediate_transition(_number_slash_end_handler(c, ctx, trans))
else:
can_terminate = False
if c == _Z:
# Z implies UTC, i.e. +00:00 local offset.
tokens.transition(_TimestampState.OFF_HOUR).append(_ZERO)
tokens.transition(_TimestampState.OFF_MINUTE).append(_ZERO)
nxt = _VALUE_TERMINATORS
can_terminate = True
elif c == _T:
nxt = _VALUE_TERMINATORS + _DIGITS
can_terminate = True
elif c in _TIMESTAMP_DELIMITERS:
nxt = _DIGITS
elif c in _DIGITS:
if prev == _PLUS or (state > _TimestampState.MONTH and prev == _HYPHEN):
state = _TimestampState.OFF_HOUR
val = tokens.transition(state)
if prev == _HYPHEN:
val.append(prev)
elif prev in (_TIMESTAMP_DELIMITERS + (_T,)):
state = _TimestampState(state + 1)
val = tokens.transition(state)
if state == _TimestampState.FRACTIONAL:
nxt = _DIGITS + _TIMESTAMP_OFFSET_INDICATORS
elif prev in _DIGITS:
if state == _TimestampState.MONTH:
nxt = _TIMESTAMP_YEAR_DELIMITERS
elif state == _TimestampState.DAY:
nxt = (_T,) + _VALUE_TERMINATORS
can_terminate = True
elif state == _TimestampState.HOUR:
nxt = (_COLON,)
elif state == _TimestampState.MINUTE:
nxt = _TIMESTAMP_OFFSET_INDICATORS + (_COLON,)
elif state == _TimestampState.SECOND:
nxt = _TIMESTAMP_OFFSET_INDICATORS + (_DOT,)
elif state == _TimestampState.FRACTIONAL:
nxt = _DIGITS + _TIMESTAMP_OFFSET_INDICATORS
elif state == _TimestampState.OFF_HOUR:
nxt = (_COLON,)
elif state == _TimestampState.OFF_MINUTE: