[initial] first edition

This commit is contained in:
YinMo19 2025-11-20 16:04:09 +08:00
commit ff73cc57f0
7 changed files with 9571 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
/target
corr*

16
Cargo.lock generated Normal file
View File

@ -0,0 +1,16 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "course_3"
version = "0.1.0"
dependencies = [
"levenshtein",
]
[[package]]
name = "levenshtein"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"

7
Cargo.toml Normal file
View File

@ -0,0 +1,7 @@
[package]
name = "course_3"
version = "0.1.0"
edition = "2024"
[dependencies]
levenshtein = "1.0.5"

18
README.md Normal file
View File

@ -0,0 +1,18 @@
# Word Correction
## YinMo19
A simple word-correction program written in Rust, adapted from an extra homework problem in a C course.
It uses the Levenshtein distance algorithm to find the dictionary word most similar to each input word.
Most of the program's work is parsing the input file.
The two key pieces are the implementation of the Levenshtein distance algorithm and the sort/binary-search of words in the dictionary.
You can just run
```bash
cargo build --release
time ./target/release/word_correction
```
to test. On a MacBook Air M2, my test result is
```bash
> time ./target/release/word_correction
./target/release/word_correction 0.13s user 0.07s system 97% cpu 0.213 total
```

118
src/main.rs Normal file
View File

@ -0,0 +1,118 @@
#![doc = include_str!("../README.md")]
use levenshtein::levenshtein;
use std::fs::{File, read_to_string};
use std::io::Write;
/// Open the specified file and return its contents as a vector of
/// `String`s, one element per line.
///
/// # Panics
/// Panics with `No <file> found` when the file cannot be read.
fn parse_line(file: &str) -> Vec<String> {
    // `expect(format!(..))` would build the message even on success
    // (clippy: expect_fun_call); the closure form only formats on failure.
    read_to_string(file)
        .unwrap_or_else(|_| panic!("No {} found", file))
        .lines()
        .map(String::from)
        .collect()
}
/// Lines of `words.txt` look like
/// ```plaintext
/// 1234 hello I/am/a/test/you/can
/// 1231 correrify my/posibily/orrer
/// ```
/// Split every line on spaces and slashes so that each token —
/// including the leading number — becomes its own `String`.
fn parse_words(file: &str) -> Vec<Vec<String>> {
    let mut parsed = Vec::new();
    for line in parse_line(file) {
        let tokens = line
            .split(&['/', ' '])
            .map(String::from)
            .collect::<Vec<String>>();
        parsed.push(tokens);
    }
    parsed
}
/// Binary-search first: if `word` is already in the (sorted) dictionary,
/// return it unchanged. Otherwise scan for the dictionary word with the
/// minimum Levenshtein distance, short-circuiting as soon as a distance
/// of 1 is found — distance 0 is impossible here because the binary
/// search already ruled out an exact match. Ties keep the first word.
///
/// `dict` must be sorted ascending (see `main`) for the binary search
/// to be meaningful. Returns `""` when the dictionary is empty.
fn correrify<'a>(word: &'a str, dict: &'a [String]) -> &'a str {
    // `binary_search_by` on `as_str` avoids allocating a temporary
    // String just to perform the lookup (the original `&word.to_string()`).
    if dict.binary_search_by(|entry| entry.as_str().cmp(word)).is_ok() {
        return word;
    }

    let mut best: (usize, &str) = (usize::MAX, "");
    for candidate in dict.iter() {
        let distance = levenshtein(word, candidate.as_str());
        // 1 is optimal since an exact match (distance 0) was excluded above.
        if distance <= 1 {
            return candidate;
        }
        if distance < best.0 {
            best = (distance, candidate);
        }
    }
    best.1
}
/// The shape of `words` is
/// ```text
/// [
///     ["1324", "word1", "word2", "word3"],
///     ["1325", "word1", "word2", "word3"],
/// ]
/// ```
/// i.e. each line starts with a 4-digit number followed by the words.
/// Keep the leading number and run [`correrify`] on every other word.
///
/// (The fences are `text`, not Rust: the originals were bare fences,
/// which rustdoc compiles as doctests — and they do not compile.)
///
/// # Panics
/// Panics if a line's first token is not a 4-character numeric string.
fn select_word_correrify(words: &Vec<Vec<String>>, dict: &Vec<String>) -> Vec<Vec<String>> {
    words
        .iter()
        .map(|word_line| {
            // Input invariant: first token is the 4-digit line number.
            assert!(word_line[0].len() == 4);
            assert!(word_line[0].chars().all(|c| c.is_numeric()));
            // `once` + `chain` keeps the pipeline lazy; the original built
            // an intermediate Vec only to chain over it (needless_collect).
            std::iter::once(word_line[0].clone())
                .chain(
                    word_line
                        .iter()
                        .skip(1)
                        .map(|word| correrify(word, dict).to_string()),
                )
                .collect::<Vec<String>>()
        })
        .collect()
}
/// Write the corrected lines to `correrified_words.txt` in the same
/// layout as `words.txt`: `<number> <first-word> <rest/joined/by/slashes>`.
///
/// # Panics
/// Panics if the file cannot be created, written, or flushed, or if a
/// line holds fewer than two tokens (number plus at least one word).
fn write_correrified_words(words: &Vec<Vec<String>>) {
    let file = File::create("correrified_words.txt").expect("Unable to create file");
    // Buffer the output: an unbuffered `writeln!` per line is one
    // syscall per line, which is needlessly slow for thousands of lines.
    let mut out = std::io::BufWriter::new(file);
    for word_line in words.iter() {
        writeln!(
            out,
            "{} {} {}",
            word_line[0],
            word_line[1],
            word_line[2..].join("/")
        )
        .expect("unable to write to file");
    }
    // Drop would flush but silently swallow errors — flush explicitly.
    out.flush().expect("unable to flush correrified_words.txt");
}
/// Entry point: load and sort the vocabulary, parse the input words,
/// correct each word against the dictionary, and write the result out.
fn main() {
    // Sorting up front makes the binary search in `correrify` valid.
    let mut dictionary = parse_line("vocabulary.txt");
    dictionary.sort_unstable();

    let parsed = parse_words("words.txt");
    let corrected = select_word_correrify(&parsed, &dictionary);
    write_correrified_words(&corrected);
}

3242
vocabulary.txt Normal file

File diff suppressed because it is too large Load Diff

6167
words.txt Normal file

File diff suppressed because it is too large Load Diff