"""
Base module for all text based default handler.
Contains various text based utility methods
"""

import logging
import os
import re
import string
import unicodedata
from abc import ABC

import torch
import torch.nn.functional as F
from captum.attr import LayerIntegratedGradients

from ts.handler_utils.text_utils import get_tokenizer

from ..utils.util import CLEANUP_REGEX
from .base_handler import BaseHandler
from .contractions import CONTRACTION_MAP

logger = logging.getLogger(__name__)

CONTRACTIONS_PATTERN = re.compile(
    "({})".format("|".join(CONTRACTION_MAP.keys())),
    flags=re.IGNORECASE | re.DOTALL,
)


class TextHandler(BaseHandler, ABC):
    """
    Base class for all text-based default handlers.
    Contains various text-based utility methods.
    """

    def __init__(self):
        super().__init__()
        self.source_vocab = None
        self.tokenizer = get_tokenizer("basic_english")
        self.input_text = None
        self.lig = None
        self.initialized = None

    def initialize(self, context):
        """
        Loads the model and initializes the necessary artifacts.
        """
        super().initialize(context)
        self.initialized = False
        source_vocab = self.manifest["model"].get("sourceVocab")
        if source_vocab:
            # Backward compatibility
            self.source_vocab = torch.load(source_vocab)
        else:
            self.source_vocab = torch.load(self.get_source_vocab_path(context))
        # Captum initialization
        self.lig = LayerIntegratedGradients(self.model, self.model.embedding)
        self.initialized = True
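
    # Illustrative manifest shape (an assumption for readability, not an
    # exact TorchServe schema): the optional "sourceVocab" entry points at a
    # serialized vocabulary file, e.g.
    #   {"model": {"modelName": "my_text_model", "sourceVocab": "source_vocab.pt"}}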

    def get_source_vocab_path(self, ctx):
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        source_vocab_path = os.path.join(model_dir, "source_vocab.pt")
        if os.path.isfile(source_vocab_path):
            return source_vocab_path
        raise Exception(
            "Missing the source_vocab file. Refer to the default handler "
            "documentation for details on using text_handler."
        )

    def _expand_contractions(self, text):
        """
        Expands the contracted words in the text.
        """

        def expand_match(contraction):
            match = contraction.group(0)
            first_char = match[0]
            expanded_contraction = CONTRACTION_MAP.get(match) or CONTRACTION_MAP.get(
                match.lower()
            )
            expanded_contraction = first_char + expanded_contraction[1:]
            return expanded_contraction

        text = CONTRACTIONS_PATTERN.sub(expand_match, text)
        # Strip any apostrophes left over after expansion
        text = re.sub("'", "", text)
        return text
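
    # Example (illustrative, assuming CONTRACTION_MAP maps "don't" -> "do not"):
    #   _expand_contractions("Don't stop") -> "Do not stop"
    # The first character's case is preserved, and any leftover apostrophes
    # are stripped afterwards.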

    def _remove_accented_characters(self, text):
        """
        Removes accented characters by normalizing to ASCII.
        """
        text = (
            unicodedata.normalize("NFKD", text)
            .encode("ascii", "ignore")
            .decode("utf-8", "ignore")
        )
        return text

    def _remove_html_tags(self, text):
        """
        Removes HTML tags.
        """
        clean_text = CLEANUP_REGEX.sub("", text)
        return clean_text

    def _remove_puncutation(self, *args, **kwargs):
        """
        Misspelled in the original version; kept as a compatibility layer.
        """
        return self._remove_punctuation(*args, **kwargs)

    def _remove_punctuation(self, text):
        """
        Removes punctuation.
        """
        return text.translate(str.maketrans("", "", string.punctuation))
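
    # Examples (illustrative):
    #   _remove_accented_characters("café")  -> "cafe"
    #   _remove_punctuation("hello, world!") -> "hello world"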

    def _tokenize(self, text):
        return self.tokenizer(text)

    def get_word_token(self, input_tokens):
        """
        Constructs word tokens from text.
        """
        # Remove the unicode space marker ("Ġ") added by BPE tokenizers
        tokens = [token.replace("Ġ", "") for token in input_tokens]
        return tokens
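
    # Example (illustrative, GPT-2-style BPE where "Ġ" marks a leading space):
    #   get_word_token(["Ġthe", "Ġcat"]) -> ["the", "cat"]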

    def summarize_attributions(self, attributions):
        """
        Summarises the attributions across multiple runs.
        """
        attributions = F.softmax(attributions, dim=-1)
        attributions_sum = attributions.sum(dim=-1)
        logger.info("attributions sum shape %s", attributions_sum.shape)
        attributions = attributions / torch.norm(attributions_sum)
        return attributions
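

# --- Usage sketch (illustrative; not part of the original handler) ---
# A minimal sketch of how a concrete handler might compose the utilities
# above in a preprocess step. `ExampleTextHandler`, the request layout, and
# the vocabulary lookup are assumptions for illustration, not guarantees
# about the TorchServe API.
class ExampleTextHandler(TextHandler):
    def preprocess(self, data):
        # TorchServe typically delivers a list of {"data": ...} or
        # {"body": ...} dicts; the exact layout depends on the client request.
        text = data[0].get("data") or data[0].get("body")
        if isinstance(text, (bytes, bytearray)):
            text = text.decode("utf-8")
        text = self._remove_html_tags(text)
        text = self._remove_accented_characters(text)
        text = self._expand_contractions(text.lower())
        text = self._remove_punctuation(text)
        tokens = self._tokenize(text)
        # Assumes the loaded vocabulary supports token -> index lookup
        return torch.tensor(
            [self.source_vocab[token] for token in tokens], dtype=torch.long
        )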