I wrote this program to remove duplicate lines from wordlists used for brute-forcing, but I guess it can be used on any text file. It is a pretty long program for what it does, and I'm sure there is a way to make it simpler and easier for everyone to read. I'm still a beginner in Python.
def wordcount_initial(path=None):
    """Count the words in the input file.

    Adds the total onto the global ``num_words_initial`` (so repeated
    calls accumulate, matching the original behaviour).

    Args:
        path: File to count. Defaults to the global ``file_initial``
            chosen by the user at the prompt.
    """
    global num_words_initial
    target = file_initial if path is None else path
    with open(target, 'r') as f:
        # A "word" is any whitespace-separated token, per str.split().
        num_words_initial += sum(len(line.split()) for line in f)
def dupfilter(src=None, dst=None):
    """Copy *src* to *dst*, dropping duplicate lines.

    Duplicates are detected with a set, so the output line order is
    arbitrary (as in the original implementation). Files are opened via
    ``with`` so the handles are always closed — the original leaked both.

    Args:
        src: Input file. Defaults to the global ``file_initial``.
        dst: Output file (truncated/created). Defaults to the global
            ``file_final``.
    """
    source = file_initial if src is None else src
    dest = file_final if dst is None else dst
    with open(source, "r") as f:
        unique_lines = set(f)  # filter duplicate lines
    with open(dest, "w+") as cleandata:
        cleandata.writelines(unique_lines)
def wordcount_final(path=None):
    """Count the words in the deduplicated output file.

    Adds the total onto the global ``num_words_final`` (so repeated
    calls accumulate, matching the original behaviour).

    Args:
        path: File to count. Defaults to the global ``file_final``
            produced by ``dupfilter``.
    """
    global num_words_final
    target = file_final if path is None else path
    with open(target, 'r') as f:
        # A "word" is any whitespace-separated token, per str.split().
        num_words_final += sum(len(line.split()) for line in f)
if __name__ == '__main__':
    # Running word totals, updated in place by the helpers via `global`.
    num_words_initial = 0
    num_words_final = 0

    # Keep prompting until the user supplies both filenames.
    ready = False
    while not ready:
        file_initial = input("What is the name of the text?")
        file_final = input("How do you want to name the filtered file?")
        # Original test `file_initial and file_final != ""` only worked by
        # accident of truthiness; state the intent explicitly instead.
        if file_initial != "" and file_final != "":
            wordcount_initial()
            ready = True

    dupfilter()
    wordcount_final()
    # Every duplicated line removed also removes its words, so the word-count
    # difference reports how much was filtered out.
    print(f"Number of duplicates filtered:{num_words_initial - num_words_final}")
    input("\nPress <ENTER> to quit the program.")