[initial] first edition
This commit is contained in:
commit
ff73cc57f0
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
/target
|
||||
|
||||
corr*
|
||||
16
Cargo.lock
generated
Normal file
16
Cargo.lock
generated
Normal file
@ -0,0 +1,16 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "course_3"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"levenshtein",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "levenshtein"
|
||||
version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
|
||||
7
Cargo.toml
Normal file
7
Cargo.toml
Normal file
@ -0,0 +1,7 @@
|
||||
[package]
|
||||
name = "course_3"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
levenshtein = "1.0.5"
|
||||
18
README.md
Normal file
18
README.md
Normal file
@ -0,0 +1,18 @@
|
||||
# Word Correction
|
||||
## YinMo19
|
||||
|
||||
A simple word-correction program written in Rust; it originates from an extra homework problem in a C course.
|
||||
It uses an algorithm called the Levenshtein distance to find the word most similar to the input word.
|
||||
Parsing the input file is the main part of what the program does.
|
||||
The two key achievements of the program are the implementation of the Levenshtein distance algorithm and the sort/binary-search of the words in the dictionary.
|
||||
|
||||
You can just run
|
||||
```bash
|
||||
cargo build --release
|
||||
time ./target/release/word_correction
|
||||
```
|
||||
to test. On a MacBook Air M2, my test result is
|
||||
```bash
|
||||
> time ./target/release/word_correction
|
||||
./target/release/word_correction 0.13s user 0.07s system 97% cpu 0.213 total
|
||||
```
|
||||
118
src/main.rs
Normal file
118
src/main.rs
Normal file
@ -0,0 +1,118 @@
|
||||
#![doc = include_str!("../README.md")]
|
||||
|
||||
use levenshtein::levenshtein;
use std::fs::{File, read_to_string};
use std::io::{BufWriter, Write};
|
||||
|
||||
/// Open the specified file and return a vector of strings
/// where each element is one line of the file.
///
/// # Panics
/// Panics with `No <file> found` if the file cannot be read.
fn parse_line(file: &str) -> Vec<String> {
    read_to_string(file)
        // `unwrap_or_else` keeps the panic-message formatting lazy:
        // `.expect(format!(..).as_str())` would allocate even on success.
        .unwrap_or_else(|_| panic!("No {} found", file))
        .lines()
        .map(String::from)
        .collect()
}
|
||||
|
||||
/// Lines of a words.txt are like
|
||||
/// ```plaintext
|
||||
/// 1234 hello I/am/a/test/you/can
|
||||
/// 1231 correrify my/posibily/orrer
|
||||
/// ```
|
||||
/// We want to parse a line into a vector
|
||||
/// which elements represents each words,
|
||||
/// include first number.
|
||||
fn parse_words(file: &str) -> Vec<Vec<String>> {
|
||||
parse_line(file)
|
||||
.iter()
|
||||
.map(|word| {
|
||||
word.as_str()
|
||||
.split(&['/', ' '])
|
||||
.map(String::from)
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Binary-search first. If the word is NOT in the dictionary,
|
||||
/// we will find the word with the minimum distance.
|
||||
fn correrify<'a>(word: &'a str, dict: &'a Vec<String>) -> &'a str {
|
||||
if let Ok(_) = dict.binary_search(&word.to_string()) {
|
||||
return word;
|
||||
}
|
||||
|
||||
let mut temp_min = (usize::MAX, "");
|
||||
for check_word in dict.iter() {
|
||||
let distance = levenshtein(word, check_word.as_str());
|
||||
|
||||
if distance <= 1 {
|
||||
return check_word;
|
||||
}
|
||||
|
||||
if distance < temp_min.0 {
|
||||
temp_min = (distance, check_word);
|
||||
}
|
||||
}
|
||||
|
||||
temp_min.1
|
||||
}
|
||||
|
||||
/// The words's shape is just like
|
||||
/// ```
|
||||
/// [
|
||||
/// ["1324", "word1", "word2", "word3"],
|
||||
/// ["1325", "word1", "word2", "word3"],
|
||||
/// ]
|
||||
/// ```
|
||||
/// , and We can assert
|
||||
/// ```
|
||||
/// assert!(word_line[0].len() == 4);
|
||||
/// assert!(word_line[0].chars().all(|c| c.is_numeric()));
|
||||
/// ```
|
||||
/// We just skip the first word(4 digits number)
|
||||
/// and correrify the rest of words.
|
||||
fn select_word_correrify(words: &Vec<Vec<String>>, dict: &Vec<String>) -> Vec<Vec<String>> {
|
||||
words
|
||||
.iter()
|
||||
.map(|word_line| {
|
||||
assert!(word_line[0].len() == 4);
|
||||
assert!(word_line[0].chars().all(|c| c.is_numeric()));
|
||||
|
||||
vec![word_line[0].clone()]
|
||||
.into_iter()
|
||||
.chain(
|
||||
word_line
|
||||
.iter()
|
||||
.skip(1)
|
||||
.map(|word| correrify(word, dict).to_string())
|
||||
.collect::<Vec<_>>(),
|
||||
)
|
||||
.collect::<Vec<String>>()
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Write the corrected lines to `correrified_words.txt`
/// in the same format as `words.txt`:
/// `<number> <first word> <rest joined by '/'>`.
///
/// # Panics
/// Panics if the file cannot be created or written.
fn write_correrified_words(words: &[Vec<String>]) {
    // Buffer the output: one line per `writeln!` on a raw `File`
    // would be one syscall per line.
    let file = File::create("correrified_words.txt").expect("Unable to create file");
    let mut out = BufWriter::new(file);
    for word_line in words {
        writeln!(
            out,
            "{} {} {}",
            word_line[0],
            word_line[1],
            word_line[2..].join("/")
        )
        .expect("unable to write to file");
    }
    // Drop would flush too, but it silently swallows errors.
    out.flush().expect("unable to flush file");
}
|
||||
|
||||
fn main() {
|
||||
let mut dict = parse_line("vocabulary.txt");
|
||||
dict.sort_unstable(); // sort the dictionary to accelerate the search
|
||||
let words = parse_words("words.txt");
|
||||
let word_correrified = select_word_correrify(&words, &dict);
|
||||
|
||||
write_correrified_words(&word_correrified);
|
||||
}
|
||||
3242
vocabulary.txt
Normal file
3242
vocabulary.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user