I'm writing some small programs to practice Rust, and this one's a string compressor for the golfing language Jelly (unofficial spec). It involves some non-ASCII stuff and bigints (from the num-bigint crate).
use std::io;
use std::io::Write;
use num_bigint::{BigUint, ToBigUint};
use num_traits::cast::ToPrimitive;
use std::fs;
use std::convert::TryFrom;
fn low_first(word: &str) -> String {
let mut word_chars = word.chars();
let first = word_chars.next().unwrap();
first.to_lowercase().to_string() + &word_chars.collect::<String>()
}
enum MightContain {
Contains(usize),
ContainsCaps(usize),
Missing
}
fn dict_contains(word: &String, dict: &Vec<&str>) -> MightContain {
let low_word = word.to_lowercase();
let low_first_word = low_first(word);
let mut low: usize = 0;
let mut high: usize = dict.len();
while high > low {
let mid = (low + high) / 2;
let d_word = dict[mid];
let low_d_word = d_word.to_lowercase();
if low_d_word < low_word {
low = mid + 1;
} else if low_d_word > low_word {
high = mid;
} else {
if word == d_word {
return MightContain::Contains(mid);
} else if low_first_word == low_first(d_word) {
return MightContain::ContainsCaps(mid);
} else {
return MightContain::Missing;
}
}
}
MightContain::Missing
}
fn calc_index(s: usize, ds: usize, p: usize, q: usize) -> usize {
usize::try_from(
isize::try_from(ds - s).unwrap() -
isize::try_from((((s - 1) - p) * (((s - 1) - p) + 1)) / 2).unwrap() -
isize::try_from(p).unwrap() +
isize::try_from(q).unwrap() - 2
).unwrap()
}
fn char_diagram(word: &String) -> String {
let mut diagram = String::from("(");
for char in word.chars() {
diagram.push(char);
diagram.push_str(")(");
}
diagram.pop();
diagram
}
#[derive(Clone)]
struct ModPair {
add: BigUint,
scale: BigUint
}
fn char_coding(word: &String) -> ModPair {
let mut add = 0u32.to_biguint().unwrap();
let mut scale = 1u32.to_biguint().unwrap();
for char in word.chars().rev() {
add = (add * 96u32.to_biguint().unwrap() + (if char == '\n' { 95 } else { (char as u32) - 32 }).to_biguint().unwrap()) * 3u32.to_biguint().unwrap();
scale *= (96 * 3).to_biguint().unwrap();
}
return ModPair {
add: add,
scale: scale
};
}
fn dict_coding(short_dict: bool, caps: bool, wrong_sp: bool, index: usize) -> ModPair {
let mut add = index.to_biguint().unwrap();
let mut scale = (if short_dict { 20453u32 } else { 227845u32 }).to_biguint().unwrap();
add = add * 2u32.to_biguint().unwrap() + (if short_dict { 1u32 } else { 0u32 }).to_biguint().unwrap();
scale *= 2u32.to_biguint().unwrap();
if caps || wrong_sp {
add = (add * 3u32.to_biguint().unwrap() + (if wrong_sp { if caps { 2u32 } else { 1u32 } } else { 0u32 }).to_biguint().unwrap()) * 3u32.to_biguint().unwrap() + 2u32.to_biguint().unwrap();
scale *= 9u32.to_biguint().unwrap();
} else {
add = add * 3u32.to_biguint().unwrap() + 1u32.to_biguint().unwrap();
scale *= 3u32.to_biguint().unwrap();
}
return ModPair {
add: add,
scale: scale
};
}
fn missing_coding(first: &ModPair, second: &ModPair) -> ModPair {
ModPair {
add: &second.add * &first.scale + &first.add,
scale: &first.scale * &second.scale
}
}
fn main() {
print!("ASCII: ");
let mut input_string = String::new();
io::stdout().flush().unwrap();
io::stdin().read_line(&mut input_string).unwrap();
print!("\n");
input_string.pop();
let mut string = String::new();
for char in input_string.chars() {
if char == '¶' {
string.push('\n');
continue;
}
if char < ' ' || char > '~' {
eprintln!("Contains non-ASCII");
std::process::exit(1);
}
string.push(char);
}
let raw_short_dict = fs::read_to_string("short.dict").expect("Could not read short.dict");
let raw_long_dict = fs::read_to_string("long.dict").expect("Could not read long.dict");
let short_dict: Vec<&str> = raw_short_dict.split("\n").collect();
let long_dict: Vec<&str> = raw_long_dict.split("\n").collect();
let string_length = string.chars().count();
let tri_string_length = string_length * (string_length + 1) / 2;
if string_length == 0 {
println!("Optimal cost: {} bits", 0.0);
println!("Diagram: {}", "");
return;
}
if string_length == 1 {
println!("Optimal cost: {} bits", f64::round(f64::log2(3.0 * 96.0) * 100.0) / 100.0);
println!("Diagram: {}", string);
return;
}
let mut optimal_costs: Vec<f64> = vec![0.0; tri_string_length - string_length];
let mut diagrams: Vec<String> = vec!["".to_string(); tri_string_length - string_length];
let mut codings: Vec<ModPair> = vec![ModPair {
add: 0u32.to_biguint().unwrap(),
scale: 0u32.to_biguint().unwrap()
}; tri_string_length - string_length];
for i in 2..(string_length + 1) {
for k in 0..(string_length - (i - 1)) {
let word = string.chars().skip(k).take(i).collect::<String>();
let no_sp_word;
if word.chars().next().unwrap() == ' ' {
no_sp_word = word[1..].to_string();
} else {
no_sp_word = word[..].to_string();
}
let mut cost = f64::log2(3.0 * 96.0) * (i as f64);
let mut diagram = char_diagram(&word);
let mut coding = char_coding(&word);
let contains = if i < 60 { dict_contains(&no_sp_word, if no_sp_word.len() <= 5 { &short_dict } else { &long_dict }) } else { MightContain::Missing };
match contains {
MightContain::Contains(dict_index) => {
let word_cost = f64::log2(3.0 * 2.0 * (if (word.chars().next().unwrap() == ' ') == (k == 0) { 3.0 } else { 1.0 }) * (if no_sp_word.len() <= 5 { 20453.0 } else { 227845.0 }));
if word_cost < cost {
cost = word_cost;
diagram = format!("({})", word);
coding = dict_coding(no_sp_word.len() <= 5, false, (word.chars().next().unwrap() == ' ') == (k == 0), dict_index);
}
},
MightContain::ContainsCaps(dict_index) => {
let word_cost = f64::log2(3.0 * 2.0 * 3.0 * (if no_sp_word.len() <= 5 { 20453.0 } else { 227845.0 }));
if word_cost < cost {
cost = word_cost;
diagram = format!("({})", word);
coding = dict_coding(no_sp_word.len() <= 5, true, (word.chars().next().unwrap() == ' ') == (k == 0), dict_index);
}
},
MightContain::Missing => {
if i >= 3 {
let first_index = calc_index(string_length, tri_string_length, k + 1, k + i);
let first_cost = f64::log2(3.0 * 96.0) + optimal_costs[first_index];
if first_cost < cost {
cost = first_cost;
diagram = format!("({}){}", word.chars().next().unwrap(), diagrams[first_index]);
coding = missing_coding(&char_coding(&String::from(word.chars().next().unwrap())), &codings[first_index]);
}
let second_index = calc_index(string_length, tri_string_length, k, (k + i) - 1);
let second_cost = optimal_costs[second_index] + f64::log2(3.0 * 96.0);
if second_cost < cost {
cost = second_cost;
diagram = format!("{}({})", diagrams[second_index], word.chars().last().unwrap());
coding = missing_coding(&codings[second_index], &char_coding(&String::from(word.chars().last().unwrap())));
}
}
if i >= 4 {
for split in 2..((i + 1) - 2) {
let first_index = calc_index(string_length, tri_string_length, k, k + split);
let second_index = calc_index(string_length, tri_string_length, k + split, k + i);
let split_cost = optimal_costs[first_index] + optimal_costs[second_index];
if split_cost < cost {
cost = split_cost;
diagram = format!("{}{}", diagrams[first_index], diagrams[second_index]);
coding = missing_coding(&codings[first_index], &codings[second_index]);
}
// cost = cmp::min_by(cost, , |x, y| x.partial_cmp(y).unwrap())
}
}
}
}
let index = calc_index(string_length, tri_string_length, k, k + i);
optimal_costs[index] = cost;
diagrams[index] = diagram;
codings[index] = coding;
}
}
let string_index = calc_index(string_length, tri_string_length, 0, string_length);
println!("Optimal cost: {} bits", f64::round(optimal_costs[string_index] * 100.0) / 100.0);
println!("Diagram: {}", diagrams[string_index]);
print!("\n");
let code_page = concat!(
"¡¢£¤¥¦©¬®µ½¿€ÆÇÐÑרŒÞßæçðıȷñ÷øœþ !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~¶",
"°¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ƁƇƊƑƓƘⱮƝƤƬƲȤɓƈɗƒɠɦƙɱɲƥʠɼʂƭʋȥẠḄḌẸḤỊḲḶṂṆỌṚṢṬỤṾẈỴẒȦḂĊḊĖḞĠḢİĿṀṄȮṖṘṠṪẆẊẎŻạḅḍẹḥịḳḷṃṇọṛṣṭ§Äẉỵẓȧḃċḋėḟġḣŀṁṅȯṗṙṡṫẇẋẏż"
);
let mut final_string = String::new();
let mut n = codings[string_index].add.clone();
while n != 0u32.to_biguint().unwrap() {
let mut digit = &n % 250u32.to_biguint().unwrap();
if digit == 0u32.to_biguint().unwrap() {
digit = 250u32.to_biguint().unwrap();
}
n = (&n - &digit) / 250u32.to_biguint().unwrap();
digit -= 1u32.to_biguint().unwrap();
final_string.push(code_page.chars().collect::<Vec<char>>()[digit.to_usize().unwrap()]);
}
println!("String: {}", final_string.chars().rev().collect::<String>());
}
It reads a dictionary of words from files called short.dict and long.dict (you can find those here if you want to run it yourself), and its dependencies are num-bigint 0.4.3 and num-traits 0.2.11. When you run it, you just type some text, and it'll show you how it's grouped into words and what the resulting compressed output is. I think it's approximately O(n³), but it's reasonably fast for inputs up to around a half kilobyte.
I'm pretty new to Rust so feedback on any parts of it would be helpful, but in particular:
- Is there a better way to initialize the vector
codings? Currently I fill it with uselessModPairs which get overwritten later, which feels like a waste of time and memory, but usingVec::with_capacity(...)would cause errors when I tried to use the vector (since the capacity was right but the length was0) - Is there a better way to do math with signed and unsigned number types in things like the
calc_indexfunction? I have to do a ton oftry_froms andunwraps, since subtracting theusizes sometimes would result in negative numbers in an in-between step (but the result is always positive, so I have totry_fromandunwrapto ausizeagain) - Is there a more elegant way to do math with bigints? For every constant I add or multiply to a bigint, I have to do something like
3u32.to_biguint().unwrap(), which is really clumsy and verbose
Thanks!
Contains non-ASCIIotherwise, though I didn't test that. \$\endgroup\$