I was inspired by Find the K most-frequent values in a vector, and originally posted an early version of this in an answer.
The aim is to have a container that maintains values ordered by their frequency, so we can easily obtain the most-frequent values at any time.
I've just implemented enough to be able to insert elements and obtain the N most frequent; the addition of member functions to make it a complete container (with iterators, removal, emplacement etc.) would be straightforward and might make an enjoyable exercise for the reader.
The insert() function operates in O(1) time, confirmed by varying the number of iterations of the test program's loop between 100,000 and 100,000,000. When I used ten million insertions, run time was about 1.1 seconds on my system (compiled with g++-12 -O3).
#include <algorithm>
#include <concepts>
#include <functional>
#include <list>
#include <map>
#include <iterator>
#include <ranges>
#include <unordered_map>
#include <utility>
#include <vector>
namespace {
template<typename Key>
concept is_hashable = requires { std::hash<Key>{}; };
}
template<typename T>
class frequency_ordered_multiset
{
template<typename Key, typename Value>
using fast_map =
std::conditional_t<is_hashable<Key>,
std::unordered_map<Key, Value>,
std::map<Key, Value>>;
using key_type =
std::conditional_t<sizeof (T) <= sizeof (std::reference_wrapper<const T>),
T, std::reference_wrapper<const T>>;
// We use a list because its iterators stay valid after modifications.
// The list is always in descending order of count.
struct item { const T val; std::size_t count; };
std::list<item> items = {};
// We use this map to find the list items by their value.
using items_iter = decltype(items.begin());
fast_map<key_type, items_iter> by_value = {};
public:
template<typename V>
requires std::assignable_from<T&, V&>
void insert(V&& val)
{
auto map_it = by_value.find(val);
if (map_it == by_value.end()) {
// first appearance - put it at end of list
items.emplace_back(std::forward<V>(val), 1);
auto list_it = items.end();
by_value.emplace(list_it->val, --list_it);
return;
}
// Update existing item
auto const list_it = map_it->second;
auto const new_count = ++list_it->count;
// Search backwards for the correct position
auto const other_it =
std::ranges::lower_bound(std::reverse_iterator{list_it}, items.rend(),
new_count, std::less{}, &item::count).base();
// Move the item (without invalidating iterators)
// This may be a no-op
items.splice(other_it, items, list_it);
}
auto most_frequent(std::size_t k)
{
return items
| std::views::take(k)
| std::views::transform(&item::val);
}
};
#include <chrono>
#include <random>
#include <iostream>
int main()
{
// check correctness
frequency_ordered_multiset<int> fom;
for (auto i: {1, 1, 3, 3, 2, 2, 2, 3, 3, 3, 3, 4, 4}) {
fom.insert(i);
std::ranges::copy(fom.most_frequent(2),
std::ostream_iterator<int>(std::cout, ", "));
std::cout << '\n';
}
// test performance
std::mt19937 r;
std::uniform_int_distribution<int> dist{0, 100};
for (auto i = 0; i < 10'000'000; ++i) {
fom.insert(dist(r));
}
std::ranges::copy(fom.most_frequent(10),
std::ostream_iterator<int>(std::cout, ", "));
std::cout << '\n';
}
Test program output:
time ./281926
1,
1,
1, 3,
1, 3,
1, 3, 2,
1, 3, 2,
2, 1, 3,
2, 3, 1,
3, 2, 1,
3, 2, 1,
3, 2, 1,
3, 2, 1,
3, 2, 1,
90, 94, 71, 10, 36, 11, 15, 83, 24, 61,
1.11user 0.00system 0:01.11elapsed 100%CPU (0avgtext+0avgdata 3568maxresident)k