A minor addition: the "sample" test case is ambiguous. Consider:
$$s = 0.500,001$$
$$s - 1.000,002 - (-0.5) =
s - 0.000,002 - 0.5 = -0.000,001
$$
With synthesized tests you don't want to allow for nondeterministic behaviour. Better to choose inputs that unambiguously point toward one correct answer.
LP: Don't do this
It's possible (though not advisable) to reframe your implementation as a mixed-integer linear programming problem where:
- the structural variables are binary selection coefficients into the metabolite and adduct vectors
- there are three auxiliary variables: to minimize the objective, and one for each of metabolite and adduct to enforce exactly one choice
- since an abs needs to be applied, it requires two passes per value of
s
This works(ish) but is very slow.
#include <assert.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <glpk.h>
#define VERBOSE 1
const double epsilon = 1e-10;
static void fatal(const char *msg) {
fprintf(stderr, "%s\n", msg);
exit(1);
}
static void pfatal(const char *msg) {
perror(msg);
exit(1);
}
static void usage(const char *cmd) {
fprintf(stderr, "Usage: %s problem-number [...]\n", cmd);
exit(1);
}
static void open_files(int i_problem, FILE **file_in, FILE **file_ans) {
char filename_in[NAME_MAX], filename_ans[NAME_MAX];
snprintf(filename_in, NAME_MAX, "%d.txt", i_problem);
*file_in = fopen(filename_in, "r");
if (!*file_in)
pfatal("Failed to open input file");
snprintf(filename_ans, NAME_MAX, "ans%d.txt", i_problem);
*file_ans = fopen(filename_ans, "r");
if (!*file_ans)
pfatal("Failed to open output file");
}
static void read_line(FILE *file, char *line, int n) {
if (!fgets(line, n, file))
pfatal("Input I/O");
if (line[strlen(line) - 1] != '\n')
fatal("Input line too long");
}
static void read_ints(FILE *file, int *array, int n) {
const int field_chars = 12, buf_size = n*field_chars;
char *line = malloc(buf_size);
if (!line)
pfatal("No memory for line");
read_line(file, line, buf_size);
const char *field = line;
for (int i = 0; i < n; i++) {
int consumed;
if (sscanf(field, "%d%n", array + i, &consumed) != 1)
pfatal("Bad input");
field += consumed;
}
free(line);
}
static double *read_doubles(FILE *file, int n) {
const int field_chars = 12, buf_size = n*field_chars;
char *line = malloc(buf_size);
if (!line)
pfatal("No memory for line");
read_line(file, line, buf_size);
double *array = malloc(n * sizeof(double));
if (!array)
pfatal("No memory for input");
const char *field = line;
for (int i = 0; i < n-1; i++) {
int consumed;
if (sscanf(field, "%lf%n", array + i, &consumed) != 1)
pfatal("Bad input");
field += consumed;
}
free(line);
return array;
}
static void read_case(
FILE *file_in,
int *M, int *K, int *N,
double **m, double **a, double **s
) {
char line[256];
read_line(file_in, line, sizeof(line));
if (sscanf(line, "%d %d %d\n", M, K, N) != 3)
fatal("Incorrect test case header format");
printf("M=%d K=%d N=%d ", *M, *K, *N);
#if VERBOSE
putchar('\n');
#endif
fflush(stdout);
if (*M < 1) fatal("Out-of-range M");
if (*K < 1) fatal("Out-of-range K");
if (*N < 1) fatal("Out-of-range N");
*m = read_doubles(file_in, *M), // metabolites
*a = read_doubles(file_in, *K), // adducts
*s = read_doubles(file_in, *N); // signals
}
/*
For each given s, choose one m and one a to minimize |s - m - a|.
Show the indices in m and a.
In GLPK terms,
x[:M+K]: structural "col" variables, the actual selection coefficients
x'[:3]: auxiliary "row" variables, to constrain the solution
z: objective, should approach s
c: objective coefficients, equal to m and a concatenated
A: constraint coefficients, three constraint rows, one col for each m,a
l, u: lower and upper bounds
c
z = [m m a a a][x]
[x]
[x]
[x]
[x]
A
[x'] [m m a a a][x]
[x'] = [1 1 0 0 0][x]
[x'] [0 0 1 1 1][x]
[x]
[x]
Min|maximize z = cx subject to x' = Ax, l <= x <= u, l' <= x' <= u'
Synthesizing "minimize abs(s - m - a)" translates to:
- Maximize m+a subject to m+a <= s
- Minimize m+a subject to m+a >= s
- Take whichever solution is closer to s
*/
static glp_prob *make_prob(
int i_problem, int i_test,
int M, int K, const double *m, const double *a
) {
glp_term_out(GLP_OFF);
glp_prob *lp = glp_create_prob();
char name[64];
snprintf(name, sizeof(name), "stepik-bioinfo-2021-%d.%d", i_problem, i_test);
glp_set_prob_name(lp, name);
glp_set_obj_name(lp, "m+a");
// auxiliary "row" variables:
// 0: tracking the objective function, to enforce minimum or maximum
// 1: metabolite selection sum equal to 1
// 2: adduct selection sum equal to 1
glp_add_rows(lp, 3);
glp_set_row_name(lp, 1, "objective_limit");
glp_set_row_name(lp, 2, "fixed_sum_metabolite");
glp_set_row_name(lp, 3, "fixed_sum_adduct");
// set_row_bnds(lp, 1) deferred to the min/max step
glp_set_row_bnds(lp, 2, GLP_FX, 1, 1);
glp_set_row_bnds(lp, 3, GLP_FX, 1, 1);
// structural "column" variables, M+K selection vector of metabolites and
// adducts in [0, 1]
glp_add_cols(lp, M + K);
// The glpk array convention is dumb and 1-indexed, meaning every input
// array needs a dummy prefix
const int row_ind[4] = {INT_MIN, 1, 2, 3};
char col_name[16];
// Metabolites
for (int i = 0; i < M; i++) {
snprintf(col_name, sizeof(col_name), "m_%d", i+1);
glp_set_col_name(lp, i+1, col_name);
glp_set_col_kind(lp, i+1, GLP_BV);
// implied: glp_set_col_bnds(lp, i+1, GLP_DB, 0, 1);
glp_set_obj_coef(lp, i+1, m[i]);
double constraints[4] = {NAN, m[i], 1, 0};
glp_set_mat_col(lp, i+1, 3, row_ind, constraints);
}
// Adducts
for (int i = 0; i < K; i++) {
snprintf(col_name, sizeof(col_name), "a_%d", i+1);
glp_set_col_name(lp, i+M+1, col_name);
glp_set_col_kind(lp, i+M+1, GLP_BV);
// implied: glp_set_col_bnds(lp, i+M+1, GLP_DB, 0, 1);
glp_set_obj_coef(lp, i+M+1, a[i]);
double constraints[4] = {NAN, a[i], 0, 1};
glp_set_mat_col(lp, i+M+1, 3, row_ind, constraints);
}
return lp;
}
static int find_selected(glp_prob *lp, int n, int offset) {
for (int i = 0; i < n; i++) {
if (glp_mip_col_val(lp, i + offset + 1) > 0.5)
return i;
}
fatal("Selected index not found");
}
static double optimize(
glp_prob *lp, int direction, int i_s, double s,
const double *m, const double *a,
int M, int K, int *j_max, int *k_max
) {
const char *dir_str = direction == GLP_MIN ? "min" : "max";
#if VERBOSE
printf(" [%d] %s ", i_s, dir_str);
#endif
// Reset between optimization runs
// glp_std_basis(lp);
glp_set_obj_dir(lp, direction);
int bound = direction == GLP_MIN ? GLP_LO : GLP_UP;
glp_set_row_bnds(lp, 1, bound, s, s);
int err = glp_simplex(lp, NULL);
if (err) glp_error("GLPK simplex failure %d\n", err);
int stat = glp_get_status(lp);
if (stat == GLP_OPT) {
err = glp_intopt(lp, NULL);
if (err) glp_error("GLPK MIP failure %d\n", err);
stat = glp_mip_status(lp);
}
if (stat != GLP_OPT) {
#if VERBOSE
printf("%lf: infeasible\n", s);
#endif
return INFINITY;
}
double obj = glp_mip_obj_val(lp);
#if VERBOSE
if (direction == GLP_MIN) printf("%.2le <- %.2le ", s, obj);
else printf("%.2le -> %.2le ", obj, s);
#endif
*j_max = find_selected(lp, M, 0);
*k_max = find_selected(lp, K, M);
double error = fabs(obj - s);
#if VERBOSE
printf(
"j=%d k=%2d err=%.1le act_err=%+.1le\n",
*j_max+1, *k_max+1, error,
s - m[*j_max] - a[*k_max]
);
#endif
return error;
}
static void test_case(int i_problem, int i_test, FILE *file_in, FILE *file_ans) {
printf("problem %d.%d ", i_problem, i_test);
int M, K, N;
double *m, *a, *s;
read_case(file_in, &M, &K, &N, &m, &a, &s);
glp_prob *lp = make_prob(i_problem, i_test, M, K, m, a);
int matches = 0;
int expected[2];
for (int i_s = 0; i_s < N; i_s++) {
int j = -1, k = -1;
// Minimize m+a subject to m+a >= s
double error = optimize(lp, GLP_MIN, i_s, s[i_s], m, a, M, K, &j, &k);
if (error > epsilon) {
int j1 = -1, k1 = -1;
// Maximize m+a subject to m+a <= s
double error1 = optimize(lp, GLP_MAX, i_s, s[i_s], m, a, M, K, &j1, &k1);
if (error > error1) {
error = error1;
j = j1; k = k1;
}
}
if (j < 0 || k < 0) fatal("No solution");
read_ints(file_ans, expected, 2);
#if VERBOSE
printf(" Act %2d %2d exp %2d %2d\n", j+1, k+1, expected[0], expected[1]);
#endif
if (j+1 == expected[0] && k+1 == expected[1])
matches++;
}
glp_delete_prob(lp);
free(m); free(a); free(s);
printf(" matched %d/%d\n", matches, N);
}
int main(int argc, const char **argv) {
if (argc < 2) usage(*argv);
printf("Using glpk %s\n", glp_version());
for (int a = 1; a < argc; a++) {
FILE *file_in, *file_ans;
int i_problem;
if (sscanf(argv[a], "%d", &i_problem) != 1)
usage(*argv);
open_files(i_problem, &file_in, &file_ans);
int T;
if (fscanf(file_in, "%d\n", &T) != 1) fatal("Bad test count");
if (T < 1 || T > 3) fatal("Out-of-range test count");
for (int i_test = 0; i_test < T; i_test++) {
test_case(i_problem, i_test, file_in, file_ans);
}
}
return 0;
}
Numpy vectorization
It's possible to use something vaguely close to your original implementation but using all numpy and no loops. This works-ish for problems 1.1, 2.2 and almost everything in 3.2 but
- there's a few stray mismatches in 3.2;
- I wasn't careful enough with memory so problem 4.1 dies from OOM - this could be fixed by switching to a KN lookup instead of an MK lookup; and
- Problems 2.1 and 3.1 are totally wrong for some reason;
but it's still possible as a proof-of-concept to demonstrate how you would take your algorithm and vectorize it.
from sys import argv
import numpy as np
def solve_case(m: np.ndarray, a: np.ndarray, s: np.ndarray) -> np.ndarray:
mrep = np.tile(m, len(a))
jrep = np.tile(np.arange(len(m), dtype=np.int32), len(a))
arep = np.repeat(a, len(m))
krep = np.repeat(np.arange(len(a), dtype=np.int32), len(m))
jk = np.vstack((jrep, krep))
masum = mrep + arep
order = masum.argsort()
jk[:] = jk[:, order]
masum[:] = masum[order]
i = np.searchsorted(masum, s)
lower = np.abs(s - masum[i - 1])
upper = np.abs(s - masum[i])
adj = lower < upper
res = jk[:, i - adj]
return res.T + 1
def solve(i_problem: int) -> None:
with open(f'{i_problem}.txt') as file_in, \
open(f'ans{i_problem}.txt') as file_ans:
T = int(next(file_in))
for i_case in range(1, T + 1):
M, K, N = (int(x) for x in next(file_in).split())
print(f'problem {i_problem}.{i_case}: M={M} K={K} N={N}', end=' ')
m, a, s = (
np.genfromtxt(file_in, dtype=np.float64, max_rows=1)
for _ in range(3)
)
assert m.shape == (M,)
assert a.shape == (K,)
assert s.shape == (N,)
actual = solve_case(m, a, s)
expected = np.genfromtxt(file_ans, dtype=np.int32, max_rows=N)
matched = np.sum(actual == expected) / actual.size
print(f'{matched:.2%} matched')
def main() -> None:
for arg in argv[1:]:
solve(int(arg))
if __name__ == '__main__':
main()
masseswould need to be scanned for each signal, andMis kind of large in test case 4. Feel free to post an answer, will help me to understand better your idea. \$\endgroup\$