Below is the product of my first baby steps in programming. The script has two purposes: 1. take the HTML of a specific website, process it, and extract relevant information such as the document ID, text, and headline; 2. generate a count of the words across all the articles.
The script works and does what it is supposed to do; however, I cannot help but feel that I am missing a lot in terms of performance.
import re
import pandas as pd
from urllib.request import urlopen as uReq
from sklearn.feature_extraction.text import CountVectorizer
# Matches a complete HTML tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')


def RemoveTags(text):
    """Return *text* with every HTML tag stripped out."""
    # Compiled once at module level so repeated calls avoid re-parsing
    # the pattern.
    stripped = TAG_RE.sub('', text)
    return stripped
# A backslash followed by any single character (e.g. the literal '\n'
# sequences left behind when bytes are stringified).
ESCAPES_RE = re.compile(r'\\.')


def RemoveEscapes(text):
    """Return *text* with every backslash escape sequence deleted.

    Note: the escaped character is removed along with the backslash,
    e.g. a literal '\\n' disappears entirely.
    """
    return ESCAPES_RE.sub('', text)
def ReadFromLink(link):
    """Download *link* and return its HTML as a lower-cased string.

    Parameters
    ----------
    link : str
        URL to fetch.

    Returns
    -------
    str
        The decoded, lower-cased document body.
    """
    with uReq(link) as response:
        raw = response.read()
    # Bug fix: the original used str(raw), which yields the "b'...'"
    # repr with literal backslash escapes instead of the real text.
    # Decode the bytes instead (UTF-8, replacing undecodable bytes).
    # Also, lower() was applied twice; once is enough.
    return raw.decode('utf-8', errors='replace').lower()
def ArticleRaw(html):
    """Return every raw '<doc>...</doc>' fragment found in *html*.

    Bug fix: without re.DOTALL, '.' does not match newlines, so any
    article spanning multiple lines was silently skipped.
    """
    return re.findall(r'<doc>.*?</doc>', html, re.DOTALL)
def GetDocID(html):
    """Return every '<docid>' value in *html* as a list of ints.

    The original made three full passes over the list (findall, strip,
    int) with shadowed loop variables; int() already tolerates leading
    and trailing whitespace, so a single comprehension suffices.
    re.DOTALL keeps ids that wrap across lines from being missed.
    """
    return [int(match)
            for match in re.findall(r'<docid>(.*?)</docid>', html, re.DOTALL)]
def GetHeadline(html):
    """Return the cleaned headline of every article in *html*.

    Each headline has HTML tags and backslash escapes removed.
    The original iterated the list three times with shadowed loop
    variables; one comprehension does the same work in a single pass.
    re.DOTALL lets headlines span line breaks.
    """
    return [RemoveEscapes(RemoveTags(h))
            for h in re.findall(r'<headline>(.*?)</headline>', html, re.DOTALL)]
def GetMainText(html):
    """Return the cleaned body text of every article in *html*.

    Tags and backslash escapes are stripped and every run of
    whitespace is collapsed to a single space, so each article comes
    back as one normalized string. The original made four separate
    passes over the list; a single comprehension is equivalent.
    re.DOTALL lets article bodies span line breaks.
    """
    bodies = re.findall(r'<text>(.*?)</text>', html, re.DOTALL)
    return [' '.join(RemoveEscapes(RemoveTags(body)).split())
            for body in bodies]
# Bug fix: the original line was 'link = link', which raises NameError
# because the name was never defined (the real URL was presumably
# redacted). Set the article-page URL here before running.
# TODO: replace the placeholder with the actual source URL.
link = "http://example.com/articles.html"
html = ReadFromLink(link)

# One extraction pass per field; the lists are aligned by article order,
# so index i of each list refers to the same article.
ArticlesDict = {
    "docid": GetDocID(html),
    "raw_article": ArticleRaw(html),
    "headline": GetHeadline(html),
    "maintext": GetMainText(html),
}
def CountFeatures(text):
    """Build a bag-of-words count matrix for the given documents.

    Parameters
    ----------
    text : iterable of str
        The document texts to vectorize.

    Returns
    -------
    pandas.DataFrame
        One row per document, one column per vocabulary word, holding
        raw term counts.
    """
    # Bug fix: the original ignored its argument and read the global
    # ArticlesDict['maintext'] instead, so the parameter had no effect.
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(text)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older versions.
    try:
        columns = vectorizer.get_feature_names_out()
    except AttributeError:
        columns = vectorizer.get_feature_names()
    return pd.DataFrame(counts.toarray(), columns=columns)
# Bug fix: the original referenced an undefined name 'df_articles';
# the extracted articles live in ArticlesDict.
df_features = CountFeatures(ArticlesDict['maintext'])