[initial] first edition
This commit is contained in:
commit
ff73cc57f0
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
/target
|
||||||
|
|
||||||
|
corr*
|
||||||
16
Cargo.lock
generated
Normal file
16
Cargo.lock
generated
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
# This file is automatically @generated by Cargo.
|
||||||
|
# It is not intended for manual editing.
|
||||||
|
version = 4
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "course_3"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"levenshtein",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "levenshtein"
|
||||||
|
version = "1.0.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
|
||||||
7
Cargo.toml
Normal file
7
Cargo.toml
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
[package]
|
||||||
|
name = "course_3"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
levenshtein = "1.0.5"
|
||||||
18
README.md
Normal file
18
README.md
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
# Word Correction
|
||||||
|
## YinMo19
|
||||||
|
|
||||||
|
A simple word-correction program written in Rust, adapted from an extra-credit problem in a C course.
|
||||||
|
It uses the Levenshtein distance algorithm to find the word in the dictionary most similar to the input word.
|
||||||
|
Parsing the input file is the bulk of the work the program does.
|
||||||
|
The two key parts of the program are the implementation of the Levenshtein distance algorithm and the sort/binary-search of words in the dictionary.
|
||||||
|
|
||||||
|
You can just run
|
||||||
|
```bash
|
||||||
|
cargo build --release
|
||||||
|
time ./target/release/word_correction
|
||||||
|
```
|
||||||
|
to test. On a MacBook Air M2, my test result is
|
||||||
|
```bash
|
||||||
|
> time ./target/release/word_correction
|
||||||
|
./target/release/word_correction 0.13s user 0.07s system 97% cpu 0.213 total
|
||||||
|
```
|
||||||
118
src/main.rs
Normal file
118
src/main.rs
Normal file
@ -0,0 +1,118 @@
|
|||||||
|
#![doc = include_str!("../README.md")]
|
||||||
|
|
||||||
|
use levenshtein::levenshtein;

use std::fs::{File, read_to_string};
use std::io::{BufWriter, Write};
|
||||||
|
|
||||||
|
/// Open the specified file and return its contents as a vector of
/// strings, one element per line (line endings stripped).
///
/// # Panics
/// Panics with `No <file> found` if the file cannot be read.
fn parse_line(file: &str) -> Vec<String> {
    read_to_string(file)
        // `unwrap_or_else` builds the panic message only on failure;
        // `.expect(format!(..).as_str())` allocated it on every call.
        .unwrap_or_else(|_| panic!("No {} found", file))
        .lines()
        .map(String::from)
        .collect()
}
|
||||||
|
|
||||||
|
/// Lines of a words.txt are like
|
||||||
|
/// ```plaintext
|
||||||
|
/// 1234 hello I/am/a/test/you/can
|
||||||
|
/// 1231 correrify my/posibily/orrer
|
||||||
|
/// ```
|
||||||
|
/// We want to parse a line into a vector
|
||||||
|
/// which elements represents each words,
|
||||||
|
/// include first number.
|
||||||
|
fn parse_words(file: &str) -> Vec<Vec<String>> {
|
||||||
|
parse_line(file)
|
||||||
|
.iter()
|
||||||
|
.map(|word| {
|
||||||
|
word.as_str()
|
||||||
|
.split(&['/', ' '])
|
||||||
|
.map(String::from)
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Binary-search first. If the word is NOT in the dictionary,
|
||||||
|
/// we will find the word with the minimum distance.
|
||||||
|
fn correrify<'a>(word: &'a str, dict: &'a Vec<String>) -> &'a str {
|
||||||
|
if let Ok(_) = dict.binary_search(&word.to_string()) {
|
||||||
|
return word;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut temp_min = (usize::MAX, "");
|
||||||
|
for check_word in dict.iter() {
|
||||||
|
let distance = levenshtein(word, check_word.as_str());
|
||||||
|
|
||||||
|
if distance <= 1 {
|
||||||
|
return check_word;
|
||||||
|
}
|
||||||
|
|
||||||
|
if distance < temp_min.0 {
|
||||||
|
temp_min = (distance, check_word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
temp_min.1
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The words's shape is just like
|
||||||
|
/// ```
|
||||||
|
/// [
|
||||||
|
/// ["1324", "word1", "word2", "word3"],
|
||||||
|
/// ["1325", "word1", "word2", "word3"],
|
||||||
|
/// ]
|
||||||
|
/// ```
|
||||||
|
/// , and We can assert
|
||||||
|
/// ```
|
||||||
|
/// assert!(word_line[0].len() == 4);
|
||||||
|
/// assert!(word_line[0].chars().all(|c| c.is_numeric()));
|
||||||
|
/// ```
|
||||||
|
/// We just skip the first word(4 digits number)
|
||||||
|
/// and correrify the rest of words.
|
||||||
|
fn select_word_correrify(words: &Vec<Vec<String>>, dict: &Vec<String>) -> Vec<Vec<String>> {
|
||||||
|
words
|
||||||
|
.iter()
|
||||||
|
.map(|word_line| {
|
||||||
|
assert!(word_line[0].len() == 4);
|
||||||
|
assert!(word_line[0].chars().all(|c| c.is_numeric()));
|
||||||
|
|
||||||
|
vec![word_line[0].clone()]
|
||||||
|
.into_iter()
|
||||||
|
.chain(
|
||||||
|
word_line
|
||||||
|
.iter()
|
||||||
|
.skip(1)
|
||||||
|
.map(|word| correrify(word, dict).to_string())
|
||||||
|
.collect::<Vec<_>>(),
|
||||||
|
)
|
||||||
|
.collect::<Vec<String>>()
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write the corrected lines to `correrified_words.txt` in the same
/// format as words.txt: `<number> <first-word> <rest/joined/by/slashes>`.
///
/// # Panics
/// Panics if the file cannot be created, written, or flushed, or if a
/// line has fewer than two words (indexing `word_line[1]`).
fn write_correrified_words(words: &[Vec<String>]) {
    let file = File::create("correrified_words.txt").expect("Unable to create file");
    // Buffer the output: an unbuffered File pays a syscall per line.
    let mut writer = BufWriter::new(file);
    for word_line in words {
        writeln!(
            writer,
            "{} {} {}",
            word_line[0],
            word_line[1],
            word_line[2..].join("/")
        )
        .expect("unable to write to file");
    }
    // Drop would flush but silently swallow errors; flush explicitly.
    writer.flush().expect("unable to write to file");
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let mut dict = parse_line("vocabulary.txt");
|
||||||
|
dict.sort_unstable(); // sort the dictionary to accelerate the search
|
||||||
|
let words = parse_words("words.txt");
|
||||||
|
let word_correrified = select_word_correrify(&words, &dict);
|
||||||
|
|
||||||
|
write_correrified_words(&word_correrified);
|
||||||
|
}
|
||||||
3242
vocabulary.txt
Normal file
3242
vocabulary.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user