[initial] first edition
This commit is contained in:
commit
ff73cc57f0
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
/target
|
||||||
|
|
||||||
|
corr*
|
||||||
16
Cargo.lock
generated
Normal file
16
Cargo.lock
generated
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
# This file is automatically @generated by Cargo.
|
||||||
|
# It is not intended for manual editing.
|
||||||
|
version = 4
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "course_3"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"levenshtein",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "levenshtein"
|
||||||
|
version = "1.0.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
|
||||||
7
Cargo.toml
Normal file
7
Cargo.toml
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
[package]
|
||||||
|
name = "course_3"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
levenshtein = "1.0.5"
|
||||||
18
README.md
Normal file
18
README.md
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
# Word Correction
|
||||||
|
## YinMo19
|
||||||
|
|
||||||
|
A simple word-correction program written in Rust, adapted from an extra-credit problem in a C course.
|
||||||
|
It uses the Levenshtein distance algorithm to find the word in the dictionary most similar to the input word.
|
||||||
|
Parsing the input file is the bulk of the work the program does.
|
||||||
|
The two key parts of the program are the implementation of the Levenshtein distance algorithm and the sort/binary-search of words in the dictionary.
|
||||||
|
|
||||||
|
You can just run
|
||||||
|
```bash
|
||||||
|
cargo build --release
|
||||||
|
time ./target/release/word_correction
|
||||||
|
```
|
||||||
|
to test. On a MacBook Air M2, my test result is
|
||||||
|
```bash
|
||||||
|
> time ./target/release/word_correction
|
||||||
|
./target/release/word_correction 0.13s user 0.07s system 97% cpu 0.213 total
|
||||||
|
```
|
||||||
118
src/main.rs
Normal file
118
src/main.rs
Normal file
@ -0,0 +1,118 @@
|
|||||||
|
#![doc = include_str!("../README.md")]
|
||||||
|
|
||||||
|
use levenshtein::levenshtein;

use std::fs::{File, read_to_string};
use std::io::{BufWriter, Write};
|
||||||
|
|
||||||
|
/// Open the specified file and return its contents as a vector of
/// strings, one element per line (line endings stripped).
///
/// # Panics
/// Panics with `No <file> found` if the file cannot be read.
fn parse_line(file: &str) -> Vec<String> {
    read_to_string(file)
        // `unwrap_or_else` builds the panic message only on failure;
        // `.expect(format!(..).as_str())` allocated it on every call.
        .unwrap_or_else(|_| panic!("No {} found", file))
        .lines()
        .map(String::from)
        .collect()
}
|
||||||
|
|
||||||
|
/// Lines of a words.txt are like
|
||||||
|
/// ```plaintext
|
||||||
|
/// 1234 hello I/am/a/test/you/can
|
||||||
|
/// 1231 correrify my/posibily/orrer
|
||||||
|
/// ```
|
||||||
|
/// We want to parse a line into a vector
|
||||||
|
/// which elements represents each words,
|
||||||
|
/// include first number.
|
||||||
|
fn parse_words(file: &str) -> Vec<Vec<String>> {
|
||||||
|
parse_line(file)
|
||||||
|
.iter()
|
||||||
|
.map(|word| {
|
||||||
|
word.as_str()
|
||||||
|
.split(&['/', ' '])
|
||||||
|
.map(String::from)
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Binary-search first. If the word is NOT in the dictionary,
|
||||||
|
/// we will find the word with the minimum distance.
|
||||||
|
fn correrify<'a>(word: &'a str, dict: &'a Vec<String>) -> &'a str {
|
||||||
|
if let Ok(_) = dict.binary_search(&word.to_string()) {
|
||||||
|
return word;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut temp_min = (usize::MAX, "");
|
||||||
|
for check_word in dict.iter() {
|
||||||
|
let distance = levenshtein(word, check_word.as_str());
|
||||||
|
|
||||||
|
if distance <= 1 {
|
||||||
|
return check_word;
|
||||||
|
}
|
||||||
|
|
||||||
|
if distance < temp_min.0 {
|
||||||
|
temp_min = (distance, check_word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
temp_min.1
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The words's shape is just like
|
||||||
|
/// ```
|
||||||
|
/// [
|
||||||
|
/// ["1324", "word1", "word2", "word3"],
|
||||||
|
/// ["1325", "word1", "word2", "word3"],
|
||||||
|
/// ]
|
||||||
|
/// ```
|
||||||
|
/// , and We can assert
|
||||||
|
/// ```
|
||||||
|
/// assert!(word_line[0].len() == 4);
|
||||||
|
/// assert!(word_line[0].chars().all(|c| c.is_numeric()));
|
||||||
|
/// ```
|
||||||
|
/// We just skip the first word(4 digits number)
|
||||||
|
/// and correrify the rest of words.
|
||||||
|
fn select_word_correrify(words: &Vec<Vec<String>>, dict: &Vec<String>) -> Vec<Vec<String>> {
|
||||||
|
words
|
||||||
|
.iter()
|
||||||
|
.map(|word_line| {
|
||||||
|
assert!(word_line[0].len() == 4);
|
||||||
|
assert!(word_line[0].chars().all(|c| c.is_numeric()));
|
||||||
|
|
||||||
|
vec![word_line[0].clone()]
|
||||||
|
.into_iter()
|
||||||
|
.chain(
|
||||||
|
word_line
|
||||||
|
.iter()
|
||||||
|
.skip(1)
|
||||||
|
.map(|word| correrify(word, dict).to_string())
|
||||||
|
.collect::<Vec<_>>(),
|
||||||
|
)
|
||||||
|
.collect::<Vec<String>>()
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write the corrected lines to `correrified_words.txt` in the same
/// format as words.txt: `<number> <first-word> <rest/joined/by/slashes>`.
///
/// # Panics
/// Panics if the file cannot be created, written, or flushed, or if a
/// line has fewer than two words (indexing `word_line[1]`).
fn write_correrified_words(words: &[Vec<String>]) {
    let file = File::create("correrified_words.txt").expect("Unable to create file");
    // Buffer the output: an unbuffered File pays a syscall per line.
    let mut writer = BufWriter::new(file);
    for word_line in words {
        writeln!(
            writer,
            "{} {} {}",
            word_line[0],
            word_line[1],
            word_line[2..].join("/")
        )
        .expect("unable to write to file");
    }
    // Drop would flush but silently swallow errors; flush explicitly.
    writer.flush().expect("unable to write to file");
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let mut dict = parse_line("vocabulary.txt");
|
||||||
|
dict.sort_unstable(); // sort the dictionary to accelerate the search
|
||||||
|
let words = parse_words("words.txt");
|
||||||
|
let word_correrified = select_word_correrify(&words, &dict);
|
||||||
|
|
||||||
|
write_correrified_words(&word_correrified);
|
||||||
|
}
|
||||||
3242
vocabulary.txt
Normal file
3242
vocabulary.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user