Optimal Jelly string compressor in Rust

Question

I'm writing some small programs to practice Rust, and this one's a string compressor for the golfing language Jelly (unofficial spec). It involves some non-ASCII stuff and bigints (from the num-bigint crate).

use std::io;
use std::io::Write;
use num_bigint::{BigUint, ToBigUint};
use num_traits::cast::ToPrimitive;
use std::fs;
use std::convert::TryFrom;

fn low_first(word: &str) -> String {
    let mut word_chars = word.chars();
    
    let first = word_chars.next().unwrap();
    
    first.to_lowercase().to_string() + &word_chars.collect::<String>()
}

enum MightContain {
    Contains(usize),
    ContainsCaps(usize),
    Missing
}

fn dict_contains(word: &String, dict: &Vec<&str>) -> MightContain {
    let low_word = word.to_lowercase();
    let low_first_word = low_first(word);
    
    let mut low: usize = 0;
    let mut high: usize = dict.len();

    while high > low {
        let mid = (low + high) / 2;

        let d_word = dict[mid];
        let low_d_word = d_word.to_lowercase();

        if low_d_word < low_word {
            low = mid + 1;
        } else if low_d_word > low_word {
            high = mid;
        } else {
            if word == d_word {
                return MightContain::Contains(mid);
            } else if low_first_word == low_first(d_word) {
                return MightContain::ContainsCaps(mid);
            } else {
                return MightContain::Missing;
            }
        }
    }

    MightContain::Missing
}

fn calc_index(s: usize, ds: usize, p: usize, q: usize) -> usize {
    usize::try_from(
        isize::try_from(ds - s).unwrap() -
        isize::try_from((((s - 1) - p) * (((s - 1) - p) + 1)) / 2).unwrap() -
        isize::try_from(p).unwrap() +
        isize::try_from(q).unwrap() - 2
    ).unwrap()
}

fn char_diagram(word: &String) -> String {
    let mut diagram = String::from("(");
    
    for char in word.chars() {
        diagram.push(char);
        diagram.push_str(")(");
    }
    
    diagram.pop();
    
    diagram
}

#[derive(Clone)]
struct ModPair {
    add: BigUint,
    scale: BigUint
}

fn char_coding(word: &String) -> ModPair {
    let mut add = 0u32.to_biguint().unwrap();
    let mut scale = 1u32.to_biguint().unwrap();
    
    for char in word.chars().rev() {
        add = (add * 96u32.to_biguint().unwrap() + (if char == '\n' { 95 } else { (char as u32) - 32 }).to_biguint().unwrap()) * 3u32.to_biguint().unwrap();
        scale *= (96 * 3).to_biguint().unwrap();
    }
    
    return ModPair {
        add: add,
        scale: scale
    };
}

fn dict_coding(short_dict: bool, caps: bool, wrong_sp: bool, index: usize) -> ModPair {
    let mut add = index.to_biguint().unwrap();
    let mut scale = (if short_dict { 20453u32 } else { 227845u32 }).to_biguint().unwrap();
    
    add = add * 2u32.to_biguint().unwrap() + (if short_dict { 1u32 } else { 0u32 }).to_biguint().unwrap();
    scale *= 2u32.to_biguint().unwrap();
    
    if caps || wrong_sp {
        add = (add * 3u32.to_biguint().unwrap() + (if wrong_sp { if caps { 2u32 } else { 1u32 } } else { 0u32 }).to_biguint().unwrap()) * 3u32.to_biguint().unwrap() + 2u32.to_biguint().unwrap();
        scale *= 9u32.to_biguint().unwrap();
    } else {
        add = add * 3u32.to_biguint().unwrap() + 1u32.to_biguint().unwrap();
        scale *= 3u32.to_biguint().unwrap();
    }
    
    return ModPair {
        add: add,
        scale: scale
    };
}

fn missing_coding(first: &ModPair, second: &ModPair) -> ModPair {
    ModPair {
        add: &second.add * &first.scale + &first.add,
        scale: &first.scale * &second.scale
    }
}

fn main() {
    print!("ASCII: ");
    
    let mut input_string = String::new();
    
    io::stdout().flush().unwrap();
    
    io::stdin().read_line(&mut input_string).unwrap();
    
    print!("\n");
    
    input_string.pop();
    
    let mut string = String::new();
    
    for char in input_string.chars() {
        if char == '¶' {
            string.push('\n');
            
            continue;
        }
        
        if char < ' ' || char > '~' {
            eprintln!("Contains non-ASCII");
            
            std::process::exit(1);
        }
        
        string.push(char);
    }
    
    let raw_short_dict = fs::read_to_string("short.dict").expect("Could not read short.dict");
    let raw_long_dict = fs::read_to_string("long.dict").expect("Could not read long.dict");
    
    let short_dict: Vec<&str> = raw_short_dict.split("\n").collect();
    let long_dict: Vec<&str> = raw_long_dict.split("\n").collect();
    
    let string_length = string.chars().count();
    let tri_string_length = string_length * (string_length + 1) / 2;
    
    if string_length == 0 {
        println!("Optimal cost: {} bits", 0.0);
        println!("Diagram: {}", "");
        
        return;
    }
    
    if string_length == 1 {
        println!("Optimal cost: {} bits", f64::round(f64::log2(3.0 * 96.0) * 100.0) / 100.0);
        println!("Diagram: {}", string);
        
        return;
    }
    
    let mut optimal_costs: Vec<f64> = vec![0.0; tri_string_length - string_length];
    let mut diagrams: Vec<String> = vec!["".to_string(); tri_string_length - string_length];
    let mut codings: Vec<ModPair> = vec![ModPair {
        add: 0u32.to_biguint().unwrap(),
        scale: 0u32.to_biguint().unwrap()
    }; tri_string_length - string_length];
    
    for i in 2..(string_length + 1) {
        for k in 0..(string_length - (i - 1)) {
            let word = string.chars().skip(k).take(i).collect::<String>();
            
            let no_sp_word;
            
            if word.chars().next().unwrap() == ' ' {
                no_sp_word = word[1..].to_string();
            } else {
                no_sp_word = word[..].to_string();
            }
            
            let mut cost = f64::log2(3.0 * 96.0) * (i as f64);
            let mut diagram = char_diagram(&word);
            let mut coding = char_coding(&word);
            
            let contains = if i < 60 { dict_contains(&no_sp_word, if no_sp_word.len() <= 5 { &short_dict } else { &long_dict }) } else { MightContain::Missing };
            
            match contains {
                MightContain::Contains(dict_index) => {
                    let word_cost = f64::log2(3.0 * 2.0 * (if (word.chars().next().unwrap() == ' ') == (k == 0) { 3.0 } else { 1.0 }) * (if no_sp_word.len() <= 5 { 20453.0 } else { 227845.0 }));

                    if word_cost < cost {
                        cost = word_cost;
                        diagram = format!("({})", word);
                        coding = dict_coding(no_sp_word.len() <= 5, false, (word.chars().next().unwrap() == ' ') == (k == 0), dict_index);
                    }
                },
                MightContain::ContainsCaps(dict_index) => {
                    let word_cost = f64::log2(3.0 * 2.0 * 3.0 * (if no_sp_word.len() <= 5 { 20453.0 } else { 227845.0 }));

                    if word_cost < cost {
                        cost = word_cost;
                        diagram = format!("({})", word);
                        coding = dict_coding(no_sp_word.len() <= 5, true, (word.chars().next().unwrap() == ' ') == (k == 0), dict_index);
                    }
                },
                MightContain::Missing => {
                    if i >= 3 {
                        let first_index = calc_index(string_length, tri_string_length, k + 1, k + i);
                        let first_cost = f64::log2(3.0 * 96.0) + optimal_costs[first_index];

                        if first_cost < cost {
                            cost = first_cost;
                            diagram = format!("({}){}", word.chars().next().unwrap(), diagrams[first_index]);
                            coding = missing_coding(&char_coding(&String::from(word.chars().next().unwrap())), &codings[first_index]);
                        }

                        let second_index = calc_index(string_length, tri_string_length, k, (k + i) - 1);
                        let second_cost = optimal_costs[second_index] + f64::log2(3.0 * 96.0);

                        if second_cost < cost {
                            cost = second_cost;
                            diagram = format!("{}({})", diagrams[second_index], word.chars().last().unwrap());
                            coding = missing_coding(&codings[second_index], &char_coding(&String::from(word.chars().last().unwrap())));
                        }
                    }

                    if i >= 4 {
                        for split in 2..((i + 1) - 2) {
                            let first_index = calc_index(string_length, tri_string_length, k, k + split);
                            let second_index = calc_index(string_length, tri_string_length, k + split, k + i);

                            let split_cost = optimal_costs[first_index] + optimal_costs[second_index];

                            if split_cost < cost {
                                cost = split_cost;
                                diagram = format!("{}{}", diagrams[first_index], diagrams[second_index]);
                                coding = missing_coding(&codings[first_index], &codings[second_index]);
                            }

                            // cost = cmp::min_by(cost, , |x, y| x.partial_cmp(y).unwrap())
                        }
                    }
                }
            }
            
            let index = calc_index(string_length, tri_string_length, k, k + i);
            
            optimal_costs[index] = cost;
            diagrams[index] = diagram;
            codings[index] = coding;
        }
    }
    
    let string_index = calc_index(string_length, tri_string_length, 0, string_length);
    
    println!("Optimal cost: {} bits", f64::round(optimal_costs[string_index] * 100.0) / 100.0);
    println!("Diagram: {}", diagrams[string_index]);
    
    print!("\n");
    
    let code_page = concat!(
        "¡¢£¤¥¦©¬®µ½¿€ÆÇÐÑ×ØŒÞßæçðıȷñ÷øœþ !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~¶",
        "°¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ƁƇƊƑƓƘⱮƝƤƬƲȤɓƈɗƒɠɦƙɱɲƥʠɼʂƭʋȥẠḄḌẸḤỊḲḶṂṆỌṚṢṬỤṾẈỴẒȦḂĊḊĖḞĠḢİĿṀṄȮṖṘṠṪẆẊẎŻạḅḍẹḥịḳḷṃṇọṛṣṭ§Äẉỵẓȧḃċḋėḟġḣŀṁṅȯṗṙṡṫẇẋẏż"
    );
    
    let mut final_string = String::new();
    
    let mut n = codings[string_index].add.clone();
    
    while n != 0u32.to_biguint().unwrap() {
        let mut digit = &n % 250u32.to_biguint().unwrap();
        
        if digit == 0u32.to_biguint().unwrap() {
            digit = 250u32.to_biguint().unwrap();
        }
        
        n = (&n - &digit) / 250u32.to_biguint().unwrap();
        
        digit -= 1u32.to_biguint().unwrap();
        
        final_string.push(code_page.chars().collect::<Vec<char>>()[digit.to_usize().unwrap()]);
    }
    
    println!("String: {}", final_string.chars().rev().collect::<String>());
}

It reads a dictionary of words from files called short.dict and long.dict (you can find those here if you want to run it yourself), and its dependencies are num-bigint 0.4.3 and num-traits 0.2.11. When you run it, you just type some text, and it'll show you how it's grouped into words and what the resulting compressed output is. I think it's approximately O(n³), but it's reasonably fast for inputs up to around a half kilobyte.

I'm pretty new to Rust so feedback on any parts of it would be helpful, but in particular:

Is there a better way to initialize the vector codings? Currently I fill it with useless ModPairs which get overwritten later, which feels like a waste of time and memory, but using Vec::with_capacity(...) would cause errors when I tried to use the vector (since the capacity was right but the length was 0)
Is there a better way to do math with signed and unsigned number types in things like the calc_index function? I have to do a ton of try_froms and unwraps, since subtracting the usizes sometimes would result in negative numbers in an in-between step (but the result is always positive, so I have to try_from and unwrap to a usize again)
Is there a more elegant way to do math with bigints? For every constant I add or multiply to a bigint, I have to do something like 3u32.to_biguint().unwrap(), which is really clumsy and verbose

Thanks!

@L.F. Yes. It should print something like Contains non-ASCII otherwise, though I didn't test that. — rydwolf
– rydwolf, Commented Jun 28, 2022 at 16:50

L. F. · Accepted Answer · 2022-06-29 04:57:21Z

Q & A

Is there a better way to initialize the vector codings? Currently I fill it with useless ModPairs which get overwritten later, which feels like a waste of time and memory, but using Vec::with_capacity(...) would cause errors when I tried to use the vector (since the capacity was right but the length was 0)

let codings = Vec::with_capacity(/* ... */);

for i in /* ... */ {
    // instead of `codings[i] = /* ... */`
    codings.push(/* ... */);
}

Is there a better way to do math with signed and unsigned number types in things like the calc_index function? I have to do a ton of try_froms and unwraps, since subtracting the usizes sometimes would result in negative numbers in an in-between step (but the result is always positive, so I have to try_from and unwrap to a usize again)

Take calc_index for example. The formula is a bit beyond my understanding, but I think you can avoid negative values by rearranging the terms such that addition happens before subtraction (if overflow isn't an issue):

fn calc_index(s: usize, ds: usize, p: usize, q: usize) -> usize {
    q + ds - s - ((((s - 1) - p) * (((s - 1) - p) + 1)) / 2) - p - 2
}

Is there a more elegant way to do math with bigints? For every constant I add or multiply to a bigint, I have to do something like 3u32.to_biguint().unwrap(), which is really clumsy and verbose

Operations such as BigUint * u32 are supported. See the section on char_coding.

Formatting

Running rustfmt (via cargo fmt) formats the code according to the official Rust Style Guide.

Personally, I like to merge use declarations:

use {
    num_bigint::{BigUint, ToBigUint},
    num_traits::cast::ToPrimitive,
    std::{
        convert::TryFrom,
        fs,
        io::{self, Write},
    },
};

Clippy

cargo clippy gives the following suggestions:

print!("\n") becomes println!(), and println!("Diagram: {}", "") is just println!("Diagram: ").
else { if .. } can be collapsed into else if ...
ModPair { add: add, scale: scale } can be replaced with the shorthand ModPair { add, scale }.
Functions should generally take &str instead of &String. See [Why is it discouraged to accept a reference to a String (&String), Vec (&Vec), or Box (&Box) as a function argument?]
Explicit return is unnecessary at the end of a block.
Clippy suggests rewriting char < ' ' || char > '~' as !(' '..='~').contains(&char).
.split("\n") can be .split('\n').
word.chars().next().unwrap() == ' ' can be simplified to word.starts_with(' ') (except the latter does not panic if word.is_empty()).

`low_first`

fn low_first(word: &str) -> String {
    let mut word_chars = word.chars();

    let first = word_chars.next().unwrap();

    first.to_lowercase().to_string() + &word_chars.collect::<String>()
}

collect allocates a String, which is appended to another String; quite wasteful (in theory, at least). In fact, Chars (the type of word.chars()) provide a method as_str that exposes its &str representation:

first.to_lowercase().to_string() + word_chars.as_str()

`dict_contains`

Take word: &str, dict: &[&str] as arguments instead.

We can use binary_search_by_key to simplify the function:

fn dict_contains(word: &str, dict: &[&str]) -> MightContain {
    let low_word = word.to_ascii_lowercase();
    let low_first_word = low_first(word);

    let pos = match dict.binary_search_by_key(&low_word, |s| s.to_ascii_lowercase()) {
        Ok(pos) => pos,
        Err(_) => return MightContain::Missing,
    };
    let d_word = dict[pos];

    if word == d_word {
        MightContain::Contains(pos)
    } else if low_first_word == low_first(d_word) {
        MightContain::ContainsCaps(pos)
    } else {
        MightContain::Missing
    }
}

Note that to_ascii_lowercase is better than to_lowercase if the string is known to be ASCII-only.

`char_diagram`

More intuitively:

fn char_diagram(word: &String) -> String {
    word.chars().flat_map(|c| ['(', c, ')']).collect()
}

`char_coding`

96u32.to_biguint().unwrap() can be simplified to BigUint::from(96_u32). Eliminating these magic constants helps clarify their meanings. Also, I find *= and += more readable here:

fn char_coding(word: &String) -> ModPair {
    use num_traits::{One, Zero};

    let mut add = BigUint::zero();
    let mut scale = BigUint::one();

    // better names welcome
    const ASCII_OFFSET: u32 = 32;
    const CHAR_RADIX: u32 = 96;
    const CODE_RADIX: u32 = 3;

    for c in word.chars().rev() {
        add *= CHAR_RADIX;
        add += match c {
            '\n' => CHAR_RADIX - 1,
            _ => c as u32 - ASCII_OFFSET,
        };
        add *= CODE_RADIX;

        scale *= CHAR_RADIX * CODE_RADIX;
    }

    ModPair { add, scale }
}

Note that BigUint supports built-in unsigned integer types on the right-hand side of arithmetic operators. I also renamed char to c since char is a built-in type name.

Similar comments apply to dict_coding.

Printing

print!("ASCII: ");
// ...
io::stdout().flush().unwrap();
// ...
print!("\n");

I recommended printing the prompt to stderr, which not only eliminates the need to manually flush but also allows the user to separate actual output from prompts by separating stdout and stderr:

eprint!("ASCII: ");

Pre-processing input

let mut string = String::new();

for char in input_string.chars() {
    if char == '¶' {
        string.push('\n');

        continue;
    }

    if char < ' ' || char > '~' {
        eprintln!("Contains non-ASCII");

        std::process::exit(1);
    }

    string.push(char);
}

How about

let string = input_string.replace('¶', "\n");
if string.chars().any(|c| !(' '..='~').contains(&c)) {
    eprintln!("Contains non-ASCII");
    std::process::exit(1);
}

Other parts of `main`

main is a very long function.

It would be more convenient if the paths to the dictionaries are provided as arguments.

2..(string_length + 1) is 2..=string_length.

string.chars().skip(k).take(i).collect::<String> is string[k..(k + i)].to_owned() since string is ASCII-only.

Instead of

let no_sp_word;

if word.chars().next().unwrap() == ' ' {
    no_sp_word = word[1..].to_string();
} else {
    no_sp_word = word[..].to_string();
}

we have

let no_sp_word = if word.starts_with(' ') {
    &word[1..]
} else {
    &word[..]
};

(note that allocation is unnecessary).

Thanks, there's a lot of helpful information here! I don't think the suggestion of pushing to codings would work since I don't insert items in order, but maybe I could push to a second Vec, and use a Vec<usize> to map between the indices. — rydwolf
– rydwolf, Commented Jun 29, 2022 at 15:42
@RadvylfPrograms Oops - didn't notice that the insertions are out of order. In that case, I would stick with the current version, since initializing with all zero values is probably going to compile to a simple memset and unlikely to cause much overhead anyway. If the elements are sparse enough, switching to another data structure (such as HashMap) might also be an option. — L. F.
– L. F., Commented Jun 29, 2022 at 15:47

Stack Exchange Network

Optimal Jelly string compressor in Rust

1 Answer 1

Q & A

Formatting

Clippy

`low_first`

`dict_contains`

`char_diagram`

`char_coding`

Printing

Pre-processing input

Other parts of `main`

You must log in to answer this question.

Hot Network Questions

Optimal Jelly string compressor in Rust

1 Answer 1

Q & A

Formatting

Clippy

low_first

dict_contains

char_diagram

char_coding

Printing

Pre-processing input

Other parts of main

You must log in to answer this question.

Related

Hot Network Questions

`low_first`

`dict_contains`

`char_diagram`

`char_coding`

Other parts of `main`