I have tried to follow the formulas for term frequency–inverse document frequency (TF-IDF) and cosine similarity, and translated them into code. The results I get seem to be what they should be, but I am worried that I have missed something, or that I have not done the TF-IDF or cosine calculations correctly. If someone could please review it and give some pointers or critique, it would be a huge help.
This is my class for calculations
package dk.processfactory.dmrpraktik.util;
import java.util.*;
/**
 * Builds TF-IDF vectors for a corpus of objects and ranks the corpus documents by cosine
 * similarity to a query string.
 *
 * <p>Each object is turned into a document via {@code toString()}, lower-cased, and split into
 * terms on whitespace and the characters {@code , = { } [ ]}. Every call to
 * {@link #createTFIDFVectors(List)} replaces the previously indexed corpus.
 *
 * @param <T> type of the objects that are indexed; only their {@code toString()} is used
 */
public class TFIDF<T> {

    /** One or more separator characters (whitespace or any of {@code , = { } [ ]}). */
    private static final String TERM_DELIMITER_REGEX = "[,={}\\[\\]\\s]+";

    private List<String> documents = new ArrayList<>();
    private List<String> vocabularyList = new ArrayList<>();
    private Map<Integer, double[]> tfIdfVectors = new HashMap<>();
    /** Number of corpus documents containing each term; filled once per corpus. */
    private Map<String, Integer> documentFrequencies = new HashMap<>();

    /**
     * Lower-cases text independently of the JVM default locale, so documents and queries are
     * always normalized the same way.
     *
     * @param text raw text
     * @return the lower-cased text
     */
    private String normalize(String text) {
        return text.toLowerCase(Locale.ROOT);
    }

    /**
     * Makes documents out of a list of objects, by turning them into normalized strings.
     *
     * @param objects the objects to convert
     * @return list of objects as lower-cased strings, in the same order
     */
    private List<String> splitIntoDocuments(List<T> objects) {
        List<String> result = new ArrayList<>(objects.size());
        for (T object : objects) {
            result.add(normalize(object.toString()));
        }
        return result;
    }

    /**
     * Extracts all terms from a document.
     *
     * @param document the (already normalized) document text
     * @return the non-empty terms of the document, in order of appearance, duplicates included
     */
    private List<String> terms(String document) {
        List<String> terms = new ArrayList<>();
        for (String term : document.split(TERM_DELIMITER_REGEX)) {
            // split() yields an empty token for an empty document or a leading separator;
            // that token is not a real term and must not enter the vocabulary.
            if (!term.isEmpty()) {
                terms.add(term);
            }
        }
        return terms;
    }

    /**
     * Calculates the IDF value of a term for the currently indexed corpus.
     * IDF = log(total number of documents / documents that contain the term)
     *
     * @param term the term to look up
     * @return the IDF value, or 0 if no indexed document contains the term
     */
    private double idf(String term) {
        int df = documentFrequencies.getOrDefault(term, 0);
        if (df == 0) {
            return 0.0;
        }
        return Math.log((double) documents.size() / df);
    }

    /**
     * Builds the TF-IDF vector of a term list against the current vocabulary.
     * TF = number of times term occurs in document / total number of terms in document.
     *
     * @param terms the terms of one document or query
     * @return a vector with one dimension per vocabulary entry; all zeros if {@code terms} is
     *     empty or shares no term with the vocabulary
     */
    private double[] vectorize(List<String> terms) {
        double[] vector = new double[vocabularyList.size()];
        if (terms.isEmpty()) {
            // Avoids the 0/0 term frequency an empty document would otherwise produce.
            return vector;
        }
        Map<String, Integer> counts = new HashMap<>();
        for (String term : terms) {
            counts.merge(term, 1, Integer::sum);
        }
        int totalTerms = terms.size();
        for (int termIndex = 0; termIndex < vector.length; termIndex++) {
            String term = vocabularyList.get(termIndex);
            Integer count = counts.get(term);
            if (count != null) {
                double tf = (double) count / totalTerms;
                vector[termIndex] = tf * idf(term);
            }
        }
        return vector;
    }

    /**
     * Indexes the given objects and calculates a TF-IDF vector for each of them, where each
     * vocabulary term is one dimension of the vector. Any previously indexed corpus is discarded.
     *
     * @param objects the objects that make up the corpus
     * @return a map where the key is the document/object index and the value is its vector; this
     *     is the same map the instance uses for later similarity queries
     */
    public Map<Integer, double[]> createTFIDFVectors(List<T> objects) {
        documents = splitIntoDocuments(objects);

        // Tokenize every document exactly once and collect document frequencies in the same pass.
        List<List<String>> tokenized = new ArrayList<>(documents.size());
        Set<String> vocabulary = new LinkedHashSet<>(); // insertion order => reproducible dimensions
        Map<String, Integer> frequencies = new HashMap<>();
        for (String document : documents) {
            List<String> terms = terms(document);
            tokenized.add(terms);
            vocabulary.addAll(terms);
            for (String distinctTerm : new HashSet<>(terms)) {
                frequencies.merge(distinctTerm, 1, Integer::sum);
            }
        }
        vocabularyList = new ArrayList<>(vocabulary);
        documentFrequencies = frequencies;

        // A fresh map guarantees that no vectors from an earlier corpus survive.
        Map<Integer, double[]> vectors = new HashMap<>();
        for (int docIndex = 0; docIndex < tokenized.size(); docIndex++) {
            vectors.put(docIndex, vectorize(tokenized.get(docIndex)));
        }
        tfIdfVectors = vectors;
        return tfIdfVectors;
    }

    /**
     * Calculates the cosine similarity between two vectors; it is used to find the most similar
     * documents. For the non-negative vectors produced by this class the value lies between 0 (no
     * shared terms) and 1 (same direction).
     *
     * @param vectorA first vector
     * @param vectorB second vector
     * @return the cosine similarity, or 0 if either vector has zero length (norm), since the
     *     angle is undefined in that case
     * @throws IllegalArgumentException if the vectors do not have the same number of dimensions
     */
    public double cosineSimilarity(double[] vectorA, double[] vectorB) {
        if (vectorA.length != vectorB.length) {
            throw new IllegalArgumentException(
                    "Vector dimensions differ: " + vectorA.length + " vs " + vectorB.length);
        }
        double dotProduct = 0.0;
        double normA = 0.0;
        double normB = 0.0;
        for (int i = 0; i < vectorA.length; i++) {
            dotProduct += vectorA[i] * vectorB[i];
            normA += vectorA[i] * vectorA[i];
            normB += vectorB[i] * vectorB[i];
        }
        if (normA == 0.0 || normB == 0.0) {
            // Would be 0/0 = NaN, and NaN sorts before every number in descending order.
            return 0.0;
        }
        return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
    }

    /**
     * Creates a vector for a document that is not part of the corpus, using the vocabulary and
     * document frequencies of the indexed corpus.
     *
     * @param document the query document
     * @return a vector representing the document
     */
    private double[] createVectorFromCorpus(String document) {
        // The corpus was lower-cased when indexed, so the query must be as well.
        return vectorize(terms(normalize(document)));
    }

    /**
     * Lists the documents of the corpus ordered by similarity to the given query.
     *
     * @param object the text that represents the query document
     * @return a list with indexes of all indexed documents, most similar first; documents with
     *     equal similarity are ordered by ascending index
     */
    public List<Integer> listMostSimilarDescend(String object) {
        double[] objectVector = createVectorFromCorpus(object);
        Map<Integer, Double> cosineSimilarities = new HashMap<>();
        for (Map.Entry<Integer, double[]> entry : tfIdfVectors.entrySet()) {
            cosineSimilarities.put(entry.getKey(), cosineSimilarity(objectVector, entry.getValue()));
        }
        Comparator<Map.Entry<Integer, Double>> bySimilarityDescending = (left, right) -> {
            int bySimilarity = Double.compare(right.getValue(), left.getValue());
            return bySimilarity != 0
                    ? bySimilarity
                    : Integer.compare(left.getKey(), right.getKey());
        };
        return cosineSimilarities.entrySet().stream()
                .sorted(bySimilarityDescending)
                .map(Map.Entry::getKey)
                .toList();
    }
}
This is an example of how the class is used, so you can see its purpose:
/**
 * Demo entry point: starts the Spring application, builds TF-IDF vectors for three sample
 * sentences, prints each vector, prints two pairwise cosine similarities and finally the
 * similarity ranking for a sample query.
 *
 * @param args command-line arguments, passed through to Spring
 */
public static void main(String[] args) {
    SpringApplication.run(TfidfApplication.class, args);

    List<String> documents = Arrays.asList(
            "The cat sat on the mat",
            "The dog sat on the mat",
            "dog and dogs are animals");

    TFIDF<String> tfidf = new TFIDF<>();
    Map<Integer, double[]> tfIdfVectors = tfidf.createTFIDFVectors(documents);

    System.out.println("TF-IDF Vectors:");
    tfIdfVectors.forEach((docIndex, vector) -> {
        System.out.println("Document " + docIndex);
        // Collect the whole row first; the printed line is the same as printing value by value.
        StringBuilder row = new StringBuilder();
        for (double value : vector) {
            row.append(value).append(" ");
        }
        System.out.println(row);
    });

    double firstPair = tfidf.cosineSimilarity(tfIdfVectors.get(0), tfIdfVectors.get(1));
    System.out.println("Cosine between doc0 and doc1: " + firstPair);

    double secondPair = tfidf.cosineSimilarity(tfIdfVectors.get(1), tfIdfVectors.get(2));
    System.out.println("Cosine between doc1 and doc2: " + secondPair);

    List<Integer> ranking = tfidf.listMostSimilarDescend("cat sat the mat");
    System.out.println("Most similar documents: " + ranking);
}
Results of main function:
TF-IDF Vectors:
Document 0
0.06757751801802739 0.06757751801802739 0.06757751801802739 0.0 0.0 0.1831020481113516 0.06757751801802739 0.0 0.0 0.0 0.06757751801802739
Document 1
0.06757751801802739 0.06757751801802739 0.06757751801802739 0.0 0.0 0.0 0.06757751801802739 0.0 0.0 0.06757751801802739 0.06757751801802739
Document 2
0.0 0.0 0.0 0.21972245773362198 0.21972245773362198 0.0 0.0 0.21972245773362198 0.21972245773362198 0.08109302162163289 0.0
Cosine between doc0 and doc1: 0.5810469954347838
Most similar documents: [0, 1, 2]