Skip to main content
Respond to @J_H's feedback, provide alternate version using a data structure with labeled fields
Source Link
Setris
  • 1.7k
  • 1
  • 11
  • 17

EDIT: Based on @J_H's suggestion, here's a modified version of the above using a data structure with labeled fields:

from collections import Counter
from dataclasses import dataclass


@dataclass(frozen=True)
class Entry:
    frequency: int
    letter: str
    source: str


def frequencies(s: str, source: str) -> list[Entry]:
    counter = Counter(c for c in s if c.islower())
    return [
        Entry(frequency, letter, source)
        for letter, frequency in counter.items()
        if frequency > 1
    ]


def mix_string(letter_frequencies: list[Entry]) -> str:
    return "/".join(
        f"{entry.source}:{entry.frequency*entry.letter}"
        for entry in letter_frequencies
    )


def mix(s1: str, s2: str) -> str:
    seen_letters = set()
    letter_frequencies = frequencies(s1, "1") + frequencies(s2, "2")
    if not letter_frequencies:
        return ""
    letter_frequencies.sort(key=lambda x: (-x.frequency, x.letter))
    stack = [letter_frequencies[0]]
    seen_letters.add(letter_frequencies[0].letter)
    for entry in letter_frequencies[1:]:
        top = stack[-1]
        if entry.frequency == top.frequency and entry.letter == top.letter:
            stack.pop()
            stack.append(Entry(entry.frequency, entry.letter, "="))
        elif entry.letter in seen_letters:
            continue
        else:
            stack.append(entry)
            seen_letters.add(entry.letter)
    stack.sort(key=lambda x: (-x.frequency, x.source, x.letter))
    return mix_string(stack)

EDIT: Based on @J_H's suggestion, here's a modified version of the above using a data structure with labeled fields:

from collections import Counter
from dataclasses import dataclass


@dataclass(frozen=True)
class Entry:
    frequency: int
    letter: str
    source: str


def frequencies(s: str, source: str) -> list[Entry]:
    counter = Counter(c for c in s if c.islower())
    return [
        Entry(frequency, letter, source)
        for letter, frequency in counter.items()
        if frequency > 1
    ]


def mix_string(letter_frequencies: list[Entry]) -> str:
    return "/".join(
        f"{entry.source}:{entry.frequency*entry.letter}"
        for entry in letter_frequencies
    )


def mix(s1: str, s2: str) -> str:
    seen_letters = set()
    letter_frequencies = frequencies(s1, "1") + frequencies(s2, "2")
    if not letter_frequencies:
        return ""
    letter_frequencies.sort(key=lambda x: (-x.frequency, x.letter))
    stack = [letter_frequencies[0]]
    seen_letters.add(letter_frequencies[0].letter)
    for entry in letter_frequencies[1:]:
        top = stack[-1]
        if entry.frequency == top.frequency and entry.letter == top.letter:
            stack.pop()
            stack.append(Entry(entry.frequency, entry.letter, "="))
        elif entry.letter in seen_letters:
            continue
        else:
            stack.append(entry)
            seen_letters.add(entry.letter)
    stack.sort(key=lambda x: (-x.frequency, x.source, x.letter))
    return mix_string(stack)
Source Link
Setris
  • 1.7k
  • 1
  • 11
  • 17

Re: your question about a better way of de-deduplicating, a nice way of de-duplicating frequency entries while also removing the need to do entry cleanup via clean_dict (removing frequency entries that have a frequency count greater than 1 but also less than the most frequent count of that letter between the two input strings) is by taking the list comprised of frequency entries from both input strings and sorting it by frequency (descending) and letter (lexicographic ascending), in that order. Then take that sorted list and process it one by one while maintaining a set of "seen letters" to avoid including the above-described entries that would have been removed by clean_dict.

By sorting the entries in this way we know that frequency entries with the same frequency and letter will end up next to each other, and if we start with a stack and process entries one by one, we can efficiently de-duplicate by comparing each current entry with the top of the stack.

Here's an example implementation of this idea:

from collections import Counter


def frequencies(s: str, source: str) -> list[tuple[int, str, str]]:
    counter = Counter(c for c in s if c.islower())
    return [
        (frequency, letter, source)
        for letter, frequency in counter.items()
        if frequency > 1
    ]


def mix_string(letter_frequencies: list[tuple[int, str, str]]) -> str:
    return "/".join(
        f"{source}:{frequency*letter}"
        for frequency, letter, source in letter_frequencies
    )


def mix(s1: str, s2: str) -> str:
    seen_letters = set()
    letter_frequencies = frequencies(s1, "1") + frequencies(s2, "2")
    # sort by
    #   1. frequency (descending)
    #   2. letter (lexicographic ascending -> "a" < "b")
    letter_frequencies.sort(key=lambda x: (-x[0], x[1]))
    if not letter_frequencies:
        return ""
    stack = [letter_frequencies[0]]
    seen_letters.add(letter_frequencies[0][1])
    for frequency, letter, source in letter_frequencies[1:]:
        top = stack[-1]
        if frequency == top[0] and letter == top[1]:
            stack.pop()
            stack.append((frequency, letter, "="))
        elif letter in seen_letters:
            continue
        else:
            stack.append((frequency, letter, source))
            seen_letters.add(letter)
    # sort by
    #   1. frequency (descending)
    #   2. source (lexicographic ascending -> "1" < "2" < "=")
    #   3. letter (lexicographic ascending -> "a" < "b")
    stack.sort(key=lambda x: (-x[0], x[2], x[1]))
    return mix_string(stack)