As @TorbenPutkonen says, use a proper type instead of abusing `Map.Entry`.
I would further make use of the Stream APIs. Also, don't call `get`
on your map after calling `computeIfAbsent`:
`computeIfAbsent` already returns the value, so you already have the entry.
This is slightly cleaner:

```java
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
/**
 * Command-line tool that groups the lines of a text file by the first capturing group of a
 * regular expression and prints one summary row per group, most frequent group first.
 */
public class Main {

    /** Lines collected for a single aggregation key, together with their running count. */
    static class LineCounts {
        /** Matching lines, in the order they were read from the input. */
        final ArrayList<String> lines = new ArrayList<>();
        /** Number of matching lines; incremented once for every line added to {@link #lines}. */
        int count;
    }

    /**
     * Main method to aggregate and count the occurrences of a regex pattern in a text file.
     *
     * <p>For every key, one row is printed in the form {@code count - key - first matching line},
     * with the count zero-padded to four digits.
     *
     * @param args input text file, aggregate regex (its capturing group 1 is used as the key, so
     *             it must contain at least one capturing group), and optional ignore regex.
     * @throws IllegalArgumentException if fewer than two arguments are supplied.
     * @throws Exception if the input file is not found or cannot be read.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.out.println("""
                    Usage: java -jar <jar file> <input text file> <aggregate regex> (<ignore regex>)
                    Example: java -jar <jar file> "input.txt" ".*? (\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) .*"
                    to aggregate IP addresses from the input file.""");
            throw new IllegalArgumentException("Invalid number of arguments");
        }

        Pattern aggregatePattern = Pattern.compile(args[1]);
        // Without an ignore regex, fall back to a pattern that can never match:
        // "(?!x)x" demands an 'x' at a position where the lookahead forbids one.
        Pattern ignorePattern = Pattern.compile(args.length > 2 ? args[2] : "(?!x)x");

        Map<String, LineCounts> groups;
        // Files.lines keeps the file open until the stream is closed, so scope it explicitly.
        try (Stream<String> lines = Files.lines(Paths.get(args[0]), Charset.defaultCharset())) {
            groups = aggregate(lines, aggregatePattern, ignorePattern);
        }

        groups.entrySet().stream()
                // Sort by count in descending order. The sort is stable and the map preserves
                // insertion order, so keys with equal counts stay in input order (earlier first).
                .sorted((left, right) ->
                        Integer.compare(right.getValue().count, left.getValue().count))
                .forEach(entry -> System.out.printf("%04d - %s - %s%n",
                        entry.getValue().count, entry.getKey(), entry.getValue().lines.get(0)));
    }

    /** Groups lines that match {@code aggregatePattern} but not {@code ignorePattern} by capturing group 1. */
    private static Map<String, LineCounts> aggregate(
            Stream<String> lines, Pattern aggregatePattern, Pattern ignorePattern) {
        // LinkedHashMap remembers the order in which keys were first seen in the input.
        Map<String, LineCounts> groups = new LinkedHashMap<>();
        // forEachOrdered guarantees lines are processed in file order.
        lines.forEachOrdered(line -> {
            Matcher aggregateMatcher = aggregatePattern.matcher(line);
            if (aggregateMatcher.find() && !ignorePattern.matcher(line).find()) {
                String key = aggregateMatcher.group(1);
                // computeIfAbsent returns the existing or newly created value; no second lookup.
                LineCounts entry = groups.computeIfAbsent(key, k -> new LineCounts());
                entry.lines.add(line);
                entry.count++;
            }
        });
        return groups;
    }
}
```