Thanks to @GZ0's answer, this code snippet now runs in about 0.0319 s on a GPU and about 0.256 s on a CPU. My implementation of @GZ0's algorithm is below. Please don't hesitate to suggest any modifications that would make the snippet more pythonic :)
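For reference, the idea behind the algorithm: with binary masks, once the pixels of a patch are sorted by predicted score in descending order, thresholding at the k-th highest score keeps exactly the first k pixels. The intersection with the ground truth at that threshold is then just the cumulative sum of the sorted ground-truth values, so the Jaccard index at every candidate threshold falls out of a single pass:

    jaccard(k) = cumsum_gt(k) / (total_gt + k - cumsum_gt(k))

and the best threshold per patch is an argmax over k.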
import numpy as np
import torch
import time
USE_CUDA = torch.cuda.is_available()

# Binary ground-truth masks and predicted scores, one mask per patch
# (first dimension is n_patch).
groundtruth_masks = np.load('./masks.npy')
pred_mask = np.load('./pred_mask.npy')
n_patch = groundtruth_masks.shape[0]

groundtruth_masks = torch.from_numpy(groundtruth_masks)
pred_mask = torch.from_numpy(pred_mask)
if USE_CUDA:
    groundtruth_masks = groundtruth_masks.cuda()
    pred_mask = pred_mask.cuda()
start = time.time()

# Flatten each patch into a vector of pixels.
vector_pred = pred_mask.view(n_patch, -1)
vector_gt = groundtruth_masks.view(n_patch, -1)

# Sort the predicted scores of every patch in descending order and
# reorder the ground truth with the same permutation.
vector_pred, sort_pred_idx = torch.sort(vector_pred, descending=True)
vector_gt = vector_gt[torch.arange(n_patch)[:, None], sort_pred_idx]

# After sorting, thresholding at the k-th score keeps the first k pixels,
# so the running sum of the sorted ground truth is the intersection size
# at every candidate threshold.
gt_cumsum = torch.cumsum(vector_gt, dim=1)
gt_total = vector_gt.sum(dim=1).reshape(n_patch, 1)
# Number of pixels kept at the k-th threshold: 1, 2, ..., N.
predicted = torch.arange(start=1, end=vector_pred.shape[1] + 1)
if USE_CUDA:
    predicted = predicted.cuda()

gt_cumsum = gt_cumsum.float()
gt_total = gt_total.float()
predicted = predicted.float()

# Jaccard = |intersection| / |union|, where
# |union| = |ground truth| + |kept pixels| - |intersection|.
jaccard_idx = gt_cumsum / (gt_total + predicted - gt_cumsum)
# Best cutoff per patch; the sorted score at that cutoff is the threshold.
max_jaccard_idx, max_indices = torch.max(jaccard_idx, dim=1)
max_indices = max_indices.reshape(-1, 1)
best_threshold = vector_pred[torch.arange(n_patch)[:, None], max_indices]
best_threshold = best_threshold.reshape(-1)

if USE_CUDA:
    # CUDA kernels run asynchronously; synchronize so the timing is accurate.
    torch.cuda.synchronize()
end = time.time()

print('Best Threshold: ', best_threshold)
print('Best Jaccard Index: ', max_jaccard_idx)
print('Elapsed time: ', end - start)
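In case it helps anyone reviewing this, here is a small brute-force sanity check on fabricated data (a minimal sketch for a single patch; the random tensors and the best_threshold_bruteforce helper are made up for illustration and are not part of the original code):

import torch

def best_threshold_bruteforce(pred, gt):
    # Naive reference: try every predicted score as a threshold and
    # keep the one with the highest Jaccard index.
    best_j, best_t = 0.0, 0.0
    for t in pred:
        keep = pred >= t
        inter = (keep & gt).sum().item()
        union = (keep | gt).sum().item()
        j = inter / union if union > 0 else 0.0
        if j > best_j:
            best_j, best_t = j, t.item()
    return best_t, best_j

torch.manual_seed(0)
pred = torch.rand(16)          # one flattened fake patch
gt = torch.rand(16) > 0.5      # fake binary ground truth

# Vectorised computation, same as the snippet above with n_patch == 1.
sorted_pred, order = torch.sort(pred, descending=True)
gt_sorted = gt[order].float()
inter = torch.cumsum(gt_sorted, dim=0)
union = gt_sorted.sum() + torch.arange(1, 17).float() - inter
jaccard = inter / union
k = torch.argmax(jaccard)

print(sorted_pred[k].item(), jaccard[k].item())   # vectorised result
print(best_threshold_bruteforce(pred, gt))        # brute-force result

The two should report the same best Jaccard index (and, barring exact ties between cutoffs, the same threshold).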