(See the next iteration.)
This time I have programmed a simple data structure called \$k\$-mer index. The actual word \$k\$-mer is a synonym of substring of length \$k\$. This data structure is built by scanning (preferably) a genomic string \$T\$ over the alphabet \$A, C, G, T\$, and at each window \$W[i \ldots i + k - 1]\$ it takes the list of indices that is mapped by \$W\$ and add \$i\$ to it.
Example
Consider that \$T = ACACTGCA\$, and \$k = 2\$. Now the index will become:
AC -> [0, 2]
CA -> [1, 6]
CT -> [3]
TG -> [4]
GC -> [5]
Code
io.github.coderodde.dna.kmerindex.DnaKmerIndex.java:
package io.github.coderodde.dna.kmerindex;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.TreeMap;
/**
* This class provides a facility for computing the {@code k}-mer index over a
* genetic string containing the base nucleotides {@code A, C, G, T}.
*
* @version 1.0.0 (Aug 16, 2025)
* @since 1.0.0 (Aug 16, 2025)
*/
public class DnaKmerIndex {
/**
* The actual {@code k}-mer index data structure.
*/
private final Map<CharSequence, List<Integer>> kmerIndex = new TreeMap<>();
/**
* Constructs this {@code k}-mer index.
*
* @param genomicString the genomic string.
* @param k the {@code k}-parameter.
*/
public DnaKmerIndex(String genomicString, int k) {
check(genomicString, k);
build(genomicString, k);
}
/**
* Returns an unmodifiable list of indices at which the input {@code k}-mer
* starts.
*
* @param kmer the target {@code k}-mer.
* @return the index of the {@code kmer}.
*/
public List<Integer> getIndex(CharSequence kmer) {
return Collections.unmodifiableList(kmerIndex.get(kmer));
}
/**
* Performs the deep copy of this {@code k}-mer index and returns the copy.
* This method might be useful for custom convertion of the index to a
* string.
*
* @return the deep copy of this {@code k}-mer index.
*/
public Map<CharSequence, List<Integer>> getCopyState() {
Map<CharSequence, List<Integer>> copy = new TreeMap<>();
for (Map.Entry<CharSequence, List<Integer>> e : kmerIndex.entrySet()) {
copy.put(e.getKey(), new ArrayList<>(e.getValue()));
}
return copy;
}
/**
* Builds the actual index structure.
*
* @param genomicString the genomic string.
* @param k the {@code k}-parameter.
*/
private void build(String genomicString, int k) {
for (int startIndex = 0;
startIndex < genomicString.length() - k + 1;
startIndex++) {
CharSequence kmer = genomicString.substring(startIndex,
startIndex + k);
if (!kmerIndex.containsKey(kmer)) {
List<Integer> indices = new ArrayList<>();
indices.add(startIndex);
kmerIndex.put(kmer, indices);
} else {
kmerIndex.get(kmer).add(startIndex);
}
}
}
/**
* Checks that the input arguments are valid.
*
* @param genomicString the genomic string from which to build the index.
* @param k the length of a {@code k}-mer.
*/
private static void check(String genomicString, int k) {
Objects.requireNonNull(genomicString,
"The input genomic string is null");
if (k < 0) {
String exceptionMessage = String.format("k(%d) < 0", k);
throw new IllegalArgumentException(exceptionMessage);
}
if (k > genomicString.length()) {
String exceptionMessage =
String.format("k(%d) > genomicString.length()(%d)",
k,
genomicString.length());
throw new IllegalArgumentException(exceptionMessage);
}
}
}
io.github.coderodde.dna.kmerindex.DnaKmerIndexToStringConverter.java:
package io.github.coderodde.dna.kmerindex;
import java.util.List;
import java.util.Map;
/**
* This class provides a facility for converting the {@code k}-mer index to a
* string.
*
* @version 1.0.0 (Aug 16, 2025)
* @since 1.0.0 (Aug 16, 2025)
*/
public class DnaKmerIndexToStringConverter {
private final DnaKmerIndex kmerIndex;
public DnaKmerIndexToStringConverter(DnaKmerIndex kmerIndex) {
this.kmerIndex = kmerIndex;
}
@Override
public String toString() {
Map<CharSequence, List<Integer>> m = kmerIndex.getCopyState();
StringBuilder sb = new StringBuilder();
for (Map.Entry<CharSequence, List<Integer>> e : m.entrySet()) {
sb.append(e.getKey())
.append(": ")
.append(e.getValue())
.append('\n');
}
if (!sb.isEmpty()) {
sb.deleteCharAt(sb.length() - 1);
}
return sb.toString();
}
}
io.github.coderodde.dna.kmerindex.RandomGeneticStringProvider.java:
package io.github.coderodde.dna.kmerindex;
import java.util.Random;
/**
* This class provides a facility for computing a random genetic string.
*
* @version 1.0.0 (Aug 16, 2025)
* @since 1.0.0 (Aug 16, 2025)
*/
public class RandomGeneticStringProvider {
private static final char[] NUCLEOTIDE_BASES = { 'A', 'C', 'G', 'T' };
/**
* Generates a random genomic string.
*
* @param length the length of the genomic string.
* @param random the random number generator.
* @return a random genomic string.
*/
public static String generate(int length, Random random) {
StringBuilder sb = new StringBuilder(length);
for (int i = 0; i < length; ++i) {
int nucleotideIndex = random.nextInt(NUCLEOTIDE_BASES.length);
sb.append(NUCLEOTIDE_BASES[nucleotideIndex]);
}
return sb.toString();
}
/**
* Generates a random genomic string.
*
* @param length the length of the genomic string.
* @return a random genomic string.
*/
public static String generate(int length) {
return generate(length, new Random());
}
/**
* Generates a random genomic string.
*
* @param length the length of the genomic string.
* @param seed the seed for the random number generator.
* @return a random genomic string.
*/
public static String generate(int length, long seed) {
return generate(length, new Random(seed));
}
}
io.github.coderodde.dna.kmerindex.demo.Demo.java:
package io.github.coderodde.dna.kmerindex.demo;
import io.github.coderodde.dna.kmerindex.DnaKmerIndex;
import io.github.coderodde.dna.kmerindex.DnaKmerIndexToStringConverter;
import io.github.coderodde.dna.kmerindex.RandomGeneticStringProvider;
/**
* This class provides the demonstration program for the {@code k}-mer index.
*
* @version 1.0.0 (Aug 16, 2025)
* @since 1.0.0 (Aug 16, 2025)
*/
public final class Demo {
private static final int LENGTH = 30;
private static final int K = 2;
public static void main(String[] args) {
long seed = System.currentTimeMillis();
System.out.println("seed = " + seed);
String genomicString = RandomGeneticStringProvider.generate(LENGTH,
seed);
System.out.println("Genomic string: " + genomicString);
DnaKmerIndex kmerIndex = new DnaKmerIndex(genomicString, K);
System.out.println(new DnaKmerIndexToStringConverter(kmerIndex));
}
}
Example output
seed = 1755350850543
Genomic string: CCTCCAAAGTTGTCGCGTTTGAGACGCCAG
AA: [5, 6]
AC: [23]
AG: [7, 21, 28]
CA: [4, 27]
CC: [0, 3, 26]
CG: [13, 15, 24]
CT: [1]
GA: [20, 22]
GC: [14, 25]
GT: [8, 11, 16]
TC: [2, 12]
TG: [10, 19]
TT: [9, 17, 18]
Critique request
As always, I am eager to hear any constructive commentary on my attempt.