I have written the following function to preprocess some text data as input to a machine learning algorithm. It lowercases, tokenises, removes stop words and lemmatizes, returning a string of space-separated tokens. However, this code runs extremely slowly. What can I do to optimise it?
import os
import re
import csv
import time
import nltk
import string
import pickle
import numpy as np
import pandas as pd
import pyparsing as pp
import matplotlib.pyplot as plt
from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
def preprocessText(text, lemmatizer, lemma, ps):
    '''
    Lowercases, tokenises, removes stop words and lemmatises using WordNet.
    Returns a string of space-separated tokens.
    '''
    # Lowercase and strip everything except letters.
    words = text.lower()
    words = re.sub("[^a-zA-Z]", " ", words)
    words = word_tokenize(words)
    stemmed_words = []
    # Remove stop words.
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops]
    text = ""
    if lemmatizer == True:
        # POS-tag each token and lemmatise it with the matching WordNet tag.
        pos_translate = {'J':'a', 'V':'v', 'N':'n', 'R':'r'}
        meaningful_words = [lemma.lemmatize(w, pos=pos_translate[pos[0]] if pos[0] in pos_translate else 'n') for w, pos in nltk.pos_tag(meaningful_words)]
        # Rebuild the string, keeping only tokens longer than one character.
        for each in meaningful_words:
            if len(each) > 1:
                text = text + " " + each
        return text
    else:
        # Otherwise stem the tokens with the Porter stemmer.
        words_again = []
        for each in meaningful_words:
            words_again.append(ps.stem(each))
        text = ""
        for each in words_again:
            if len(each) > 1:
                text = text + " " + each
        return(text)
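
For context, this is roughly how the function gets called; the file name, DataFrame and column names below are only placeholders to show the call pattern, not the exact names from my project:

# Illustrative driver code - file, DataFrame and column names are placeholders.
lemma = WordNetLemmatizer()
ps = PorterStemmer()

df = pd.read_csv("data.csv")  # assumed input file
df["clean_text"] = df["text"].apply(
    lambda t: preprocessText(t, lemmatizer=True, lemma=lemma, ps=ps)
)

So the function is applied once per row of the column, which is where the slowdown shows up.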