[initial] first edition
This commit is contained in:
commit
ff73cc57f0
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
/target
|
||||
|
||||
corr*
|
||||
16
Cargo.lock
generated
Normal file
16
Cargo.lock
generated
Normal file
@ -0,0 +1,16 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "course_3"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"levenshtein",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "levenshtein"
|
||||
version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
|
||||
7
Cargo.toml
Normal file
7
Cargo.toml
Normal file
@ -0,0 +1,7 @@
|
||||
[package]
|
||||
name = "course_3"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
levenshtein = "1.0.5"
|
||||
18
README.md
Normal file
18
README.md
Normal file
@ -0,0 +1,18 @@
|
||||
# Word Correction
|
||||
## YinMo19
|
||||
|
||||
A simple word-correction program written in Rust; it originates from an extra homework problem in a C course.
|
||||
It uses an algorithm called the Levenshtein distance to find the word most similar to the input word.
|
||||
Parsing the input file is the main part of what the program does.
|
||||
The two key achievements of the program are the implementation of the Levenshtein distance algorithm and the sort/binary-search of the words in the dictionary.
|
||||
|
||||
You can just run
|
||||
```bash
|
||||
cargo build --release
|
||||
time ./target/release/word_correction
|
||||
```
|
||||
to test. On a MacBook Air M2, my test result is
|
||||
```bash
|
||||
> time ./target/release/word_correction
|
||||
./target/release/word_correction 0.13s user 0.07s system 97% cpu 0.213 total
|
||||
```
|
||||
118
src/main.rs
Normal file
118
src/main.rs
Normal file
@ -0,0 +1,118 @@
|
||||
#![doc = include_str!("../README.md")]
|
||||
|
||||
use levenshtein::levenshtein;
use std::fs::{File, read_to_string};
use std::io::{BufWriter, Write};
|
||||
|
||||
/// Open the specified file and return a vector of strings
/// where each element is one line of the file.
///
/// # Panics
/// Panics with `No <file> found` if the file cannot be read.
fn parse_line(file: &str) -> Vec<String> {
    read_to_string(file)
        // `unwrap_or_else` keeps the panic-message formatting lazy:
        // `.expect(format!(..).as_str())` would allocate even on success.
        .unwrap_or_else(|_| panic!("No {} found", file))
        .lines()
        .map(String::from)
        .collect()
}
|
||||
|
||||
/// Lines of a words.txt are like
|
||||
/// ```plaintext
|
||||
/// 1234 hello I/am/a/test/you/can
|
||||
/// 1231 correrify my/posibily/orrer
|
||||
/// ```
|
||||
/// We want to parse a line into a vector
|
||||
/// which elements represents each words,
|
||||
/// include first number.
|
||||
fn parse_words(file: &str) -> Vec<Vec<String>> {
|
||||
parse_line(file)
|
||||
.iter()
|
||||
.map(|word| {
|
||||
word.as_str()
|
||||
.split(&['/', ' '])
|
||||
.map(String::from)
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Binary-search first. If the word is NOT in the dictionary,
|
||||
/// we will find the word with the minimum distance.
|
||||
fn correrify<'a>(word: &'a str, dict: &'a Vec<String>) -> &'a str {
|
||||
if let Ok(_) = dict.binary_search(&word.to_string()) {
|
||||
return word;
|
||||
}
|
||||
|
||||
let mut temp_min = (usize::MAX, "");
|
||||
for check_word in dict.iter() {
|
||||
let distance = levenshtein(word, check_word.as_str());
|
||||
|
||||
if distance <= 1 {
|
||||
return check_word;
|
||||
}
|
||||
|
||||
if distance < temp_min.0 {
|
||||
temp_min = (distance, check_word);
|
||||
}
|
||||
}
|
||||
|
||||
temp_min.1
|
||||
}
|
||||
|
||||
/// The words's shape is just like
|
||||
/// ```
|
||||
/// [
|
||||
/// ["1324", "word1", "word2", "word3"],
|
||||
/// ["1325", "word1", "word2", "word3"],
|
||||
/// ]
|
||||
/// ```
|
||||
/// , and We can assert
|
||||
/// ```
|
||||
/// assert!(word_line[0].len() == 4);
|
||||
/// assert!(word_line[0].chars().all(|c| c.is_numeric()));
|
||||
/// ```
|
||||
/// We just skip the first word(4 digits number)
|
||||
/// and correrify the rest of words.
|
||||
fn select_word_correrify(words: &Vec<Vec<String>>, dict: &Vec<String>) -> Vec<Vec<String>> {
|
||||
words
|
||||
.iter()
|
||||
.map(|word_line| {
|
||||
assert!(word_line[0].len() == 4);
|
||||
assert!(word_line[0].chars().all(|c| c.is_numeric()));
|
||||
|
||||
vec![word_line[0].clone()]
|
||||
.into_iter()
|
||||
.chain(
|
||||
word_line
|
||||
.iter()
|
||||
.skip(1)
|
||||
.map(|word| correrify(word, dict).to_string())
|
||||
.collect::<Vec<_>>(),
|
||||
)
|
||||
.collect::<Vec<String>>()
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Write the corrected lines to `correrified_words.txt`
/// in the same format as `words.txt`:
/// `<number> <first word> <rest joined by '/'>`.
///
/// # Panics
/// Panics if the file cannot be created or written.
fn write_correrified_words(words: &[Vec<String>]) {
    // Buffer the output: one line per `writeln!` on a raw `File`
    // would be one syscall per line.
    let file = File::create("correrified_words.txt").expect("Unable to create file");
    let mut out = BufWriter::new(file);
    for word_line in words {
        writeln!(
            out,
            "{} {} {}",
            word_line[0],
            word_line[1],
            word_line[2..].join("/")
        )
        .expect("unable to write to file");
    }
    // Drop would flush too, but it silently swallows errors.
    out.flush().expect("unable to flush file");
}
|
||||
|
||||
fn main() {
|
||||
let mut dict = parse_line("vocabulary.txt");
|
||||
dict.sort_unstable(); // sort the dictionary to accelerate the search
|
||||
let words = parse_words("words.txt");
|
||||
let word_correrified = select_word_correrify(&words, &dict);
|
||||
|
||||
write_correrified_words(&word_correrified);
|
||||
}
|
||||
3242
vocabulary.txt
Normal file
3242
vocabulary.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user