diff options
author | DongHun Kwak <dh0128.kwak@samsung.com> | 2023-11-07 16:24:54 +0900 |
---|---|---|
committer | DongHun Kwak <dh0128.kwak@samsung.com> | 2023-11-07 16:24:54 +0900 |
commit | fd2cc80a31a5177d4f2cced99ed2e7235af9d6ee (patch) | |
tree | f36ea011da95dfef55020143cf57e42d75c4a0a3 | |
download | rust-difflib-upstream.tar.gz rust-difflib-upstream.tar.bz2 rust-difflib-upstream.zip |
Import difflib 0.4.0upstream/0.4.0upstream
-rw-r--r-- | Cargo.toml | 26 | ||||
-rw-r--r-- | Cargo.toml.orig | 19 | ||||
-rw-r--r-- | examples/example.rs | 63 | ||||
-rw-r--r-- | src/differ.rs | 340 | ||||
-rw-r--r-- | src/lib.rs | 169 | ||||
-rw-r--r-- | src/sequencematcher.rs | 346 | ||||
-rw-r--r-- | src/utils.rs | 47 | ||||
-rw-r--r-- | tests/tests.rs | 194 |
8 files changed, 1204 insertions, 0 deletions
diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..29a094f --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,26 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g. crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +name = "difflib" +version = "0.4.0" +authors = ["Dima Kudosh <dimakudosh@gmail.com>"] +include = ["**/*.rs", "Cargo.toml"] +description = "Port of Python's difflib library to Rust." +homepage = "https://github.com/DimaKudosh/difflib" +documentation = "https://github.com/DimaKudosh/difflib/wiki" +keywords = ["difflib", "text", "diff"] +license = "MIT" +repository = "https://github.com/DimaKudosh/difflib" + +[[test]] +name = "tests" diff --git a/Cargo.toml.orig b/Cargo.toml.orig new file mode 100644 index 0000000..2edf6df --- /dev/null +++ b/Cargo.toml.orig @@ -0,0 +1,19 @@ +[package] +name = "difflib" +version = "0.4.0" +authors = ["Dima Kudosh <dimakudosh@gmail.com>"] +description = "Port of Python's difflib library to Rust." +documentation = "https://github.com/DimaKudosh/difflib/wiki" +homepage = "https://github.com/DimaKudosh/difflib" +repository = "https://github.com/DimaKudosh/difflib" +keywords = ["difflib", "text", "diff"] +license = "MIT" +include = [ + "**/*.rs", + "Cargo.toml", +] + + +[[test]] +name = "tests" + diff --git a/examples/example.rs b/examples/example.rs new file mode 100644 index 0000000..3f06d0a --- /dev/null +++ b/examples/example.rs @@ -0,0 +1,63 @@ +extern crate difflib; + +use difflib::differ::Differ; +use difflib::sequencematcher::SequenceMatcher; + +fn main() { + // unified_diff + let first_text = "one two three four".split(" ").collect::<Vec<&str>>(); + let second_text = "zero one tree four".split(" ").collect::<Vec<&str>>(); + let diff = difflib::unified_diff( + &first_text, + &second_text, + "Original", + "Current", + "2005-01-26 23:30:50", + "2010-04-02 10:20:52", + 3, + ); + for line in &diff { + println!("{:?}", line); + } + + //context_diff + let diff = difflib::context_diff( + &first_text, + &second_text, + "Original", + "Current", + "2005-01-26 23:30:50", + "2010-04-02 10:20:52", + 3, + ); + for line in &diff { + println!("{:?}", line); + } + + //get_close_matches + let words = vec!["ape", "apple", "peach", "puppy"]; + let result = difflib::get_close_matches("appel", words, 3, 0.6); + println!("{:?}", result); + + //Differ examples + let differ = Differ::new(); + let diff = differ.compare(&first_text, &second_text); + for line in &diff { + println!("{:?}", line); + } + + //SequenceMatcher examples + let mut matcher = SequenceMatcher::new("one two three four", "zero one tree four"); + let m = matcher.find_longest_match(0, 18, 0, 18); + println!("{:?}", m); + let all_matches = matcher.get_matching_blocks(); + println!("{:?}", all_matches); + let opcode = matcher.get_opcodes(); + println!("{:?}", opcode); + let grouped_opcodes = matcher.get_grouped_opcodes(2); + println!("{:?}", grouped_opcodes); + let ratio = matcher.ratio(); + println!("{:?}", ratio); + matcher.set_seqs("aaaaa", "aaaab"); + println!("{:?}", matcher.ratio()); +} diff --git a/src/differ.rs b/src/differ.rs new file mode 100644 index 0000000..5caef9a --- /dev/null +++ b/src/differ.rs @@ -0,0 +1,340 @@ +use sequencematcher::SequenceMatcher; +use std::cmp; +use utils::{count_leading, str_with_similar_chars}; + +#[derive(Default)] +pub struct Differ { + pub line_junk: Option<fn(&&str) -> bool>, + pub char_junk: Option<fn(&char) -> bool>, +} + +impl Differ { + pub fn new() -> Differ { + Differ { + line_junk: None, + char_junk: None, + } + } + + pub fn compare(&self, first_sequence: &[&str], second_sequence: &[&str]) -> Vec<String> { + let mut matcher = SequenceMatcher::new(first_sequence, second_sequence); + matcher.set_is_junk(self.line_junk); + let mut res = Vec::new(); + for opcode in matcher.get_opcodes() { + let mut gen = Vec::new(); + match opcode.tag.as_ref() { + "replace" => { + gen = self.fancy_replace( + first_sequence, + opcode.first_start, + opcode.first_end, + second_sequence, + opcode.second_start, + opcode.second_end, + ) + } + "delete" => { + gen = self.dump("-", first_sequence, opcode.first_start, opcode.first_end) + } + "insert" => { + gen = self.dump("+", second_sequence, opcode.second_start, opcode.second_end) + } + "equal" => { + gen = self.dump(" ", first_sequence, opcode.first_start, opcode.first_end) + } + _ => {} + } + for i in gen { + res.push(i); + } + } + res + } + + fn dump(&self, tag: &str, sequence: &[&str], start: usize, end: usize) -> Vec<String> { + let mut res = Vec::new(); + for i in start..end { + if let Some(s) = sequence.get(i) { + res.push(format!("{} {}", tag, s)) + } + } + res + } + + fn plain_replace( + &self, + first_sequence: &[&str], + first_start: usize, + first_end: usize, + second_sequence: &[&str], + second_start: usize, + second_end: usize, + ) -> Vec<String> { + if !(first_start < first_end && second_start < second_end) { + return Vec::new(); + } + let (mut first, second) = if second_end - second_start < first_end - first_start { + ( + self.dump("+", second_sequence, second_start, second_end), + self.dump("-", first_sequence, first_start, first_end), + ) + } else { + ( + self.dump("-", first_sequence, first_start, first_end), + self.dump("+", second_sequence, second_start, second_end), + ) + }; + for s in second { + first.push(s); + } + first + } + + fn fancy_replace( + &self, + first_sequence: &[&str], + first_start: usize, + first_end: usize, + second_sequence: &[&str], + second_start: usize, + second_end: usize, + ) -> Vec<String> { + let mut res = Vec::new(); + let (mut best_ratio, cutoff) = (0.74, 0.75); + let (mut best_i, mut best_j) = (0, 0); + let mut eqi: Option<usize> = None; + let mut eqj: Option<usize> = None; + for (j, second_sequence_str) in second_sequence + .iter() + .enumerate() + .take(second_end) + .skip(second_start) + { + for (i, first_sequence_str) in first_sequence + .iter() + .enumerate() + .take(second_end) + .skip(second_start) + { + if first_sequence_str == second_sequence_str { + if eqi.is_none() { + eqi = Some(i); + eqj = Some(j); + } + continue; + } + let (first_sequence_chars, second_sequence_chars) = ( + first_sequence_str.chars().collect::<Vec<char>>(), + second_sequence_str.chars().collect::<Vec<char>>(), + ); + let mut cruncher = + SequenceMatcher::new(&first_sequence_chars, &second_sequence_chars); + cruncher.set_is_junk(self.char_junk); + if cruncher.ratio() > best_ratio { + best_ratio = cruncher.ratio(); + best_i = i; + best_j = j; + } + } + } + if best_ratio < cutoff { + if eqi.is_none() { + res.extend( + self.plain_replace( + first_sequence, + first_start, + first_end, + second_sequence, + second_start, + second_end, + ).iter() + .cloned(), + ); + return res; + } + best_i = eqi.unwrap(); + best_j = eqj.unwrap(); + } else { + eqi = None; + } + res.extend( + self.fancy_helper( + first_sequence, + first_start, + best_i, + second_sequence, + second_start, + best_j, + ).iter() + .cloned(), + ); + let first_element = &first_sequence[best_i]; + let second_element = &second_sequence[best_j]; + if eqi.is_none() { + let (mut first_tag, mut second_tag) = (String::new(), String::new()); + let first_element_chars: Vec<char> = first_element.chars().collect(); + let second_element_chars: Vec<char> = second_element.chars().collect(); + let mut cruncher = SequenceMatcher::new(&first_element_chars, &second_element_chars); + cruncher.set_is_junk(self.char_junk); + for opcode in &cruncher.get_opcodes() { + let (first_length, second_length) = ( + opcode.first_end - opcode.first_start, + opcode.second_end - opcode.second_start, + ); + match opcode.tag.as_ref() { + "replace" => { + first_tag.push_str(&str_with_similar_chars('^', first_length)); + second_tag.push_str(&str_with_similar_chars('^', second_length)); + } + "delete" => { + first_tag.push_str(&str_with_similar_chars('-', first_length)); + } + "insert" => { + second_tag.push_str(&str_with_similar_chars('+', second_length)); + } + "equal" => { + first_tag.push_str(&str_with_similar_chars(' ', first_length)); + second_tag.push_str(&str_with_similar_chars(' ', second_length)); + } + _ => {} + } + } + res.extend( + self.qformat(&first_element, &second_element, &first_tag, &second_tag) + .iter() + .cloned(), + ); + } else { + let mut s = String::from(" "); + s.push_str(&first_element); + res.push(s); + } + res.extend( + self.fancy_helper( + first_sequence, + best_i + 1, + first_end, + second_sequence, + best_j + 1, + second_end, + ).iter() + .cloned(), + ); + res + } + + fn fancy_helper( + &self, + first_sequence: &[&str], + first_start: usize, + first_end: usize, + second_sequence: &[&str], + second_start: usize, + second_end: usize, + ) -> Vec<String> { + let mut res = Vec::new(); + if first_start < first_end { + if second_start < second_end { + res = self.fancy_replace( + first_sequence, + first_start, + first_end, + second_sequence, + second_start, + second_end, + ); + } else { + res = self.dump("-", first_sequence, first_start, first_end); + } + } else if second_start < second_end { + res = self.dump("+", second_sequence, second_start, second_end); + } + res + } + + fn qformat( + &self, + first_line: &str, + second_line: &str, + first_tags: &str, + second_tags: &str, + ) -> Vec<String> { + let mut res = Vec::new(); + let mut first_tags = first_tags; + let mut second_tags = second_tags; + let mut common = cmp::min( + count_leading(first_line, '\t'), + count_leading(second_line, '\t'), + ); + common = cmp::min(common, count_leading(first_tags.split_at(common).0, ' ')); + common = cmp::min(common, count_leading(first_tags.split_at(common).0, ' ')); + first_tags = first_tags.split_at(common).1.trim_right(); + second_tags = second_tags.split_at(common).1.trim_right(); + let mut s = format!("- {}", first_line); + res.push(s); + if first_tags != "" { + s = format!("? {}{}\n", str_with_similar_chars('\t', common), first_tags); + res.push(s); + } + s = format!("+ {}", second_line); + res.push(s); + if second_tags != "" { + s = format!( + "? {}{}\n", + str_with_similar_chars('\t', common), + second_tags + ); + res.push(s); + } + res + } + + pub fn restore(delta: &[String], which: usize) -> Vec<String> { + if !(which == 1 || which == 2) { + panic!("Second parameter must be 1 or 2"); + } + let mut res = Vec::new(); + let tag = if which == 1 { "- " } else { "+ " }.to_string(); + let prefixes = vec![tag, " ".to_string()]; + for line in delta { + for prefix in &prefixes { + if line.starts_with(prefix) { + res.push(line.split_at(2).1.to_string()); + } + } + } + res + } +} + +#[test] +fn test_fancy_replace() { + let differ = Differ::new(); + let result = differ + .fancy_replace(&vec!["abcDefghiJkl\n"], 0, 1, &vec!["abcdefGhijkl\n"], 0, 1) + .join(""); + assert_eq!( + result, + "- abcDefghiJkl\n? ^ ^ ^\n+ abcdefGhijkl\n? ^ ^ ^\n" + ); +} + +#[test] +fn test_qformat() { + let differ = Differ::new(); + let result = differ.qformat( + "\tabcDefghiJkl\n", + "\tabcdefGhijkl\n", + " ^ ^ ^ ", + " ^ ^ ^ ", + ); + assert_eq!( + result, + vec![ + "- \tabcDefghiJkl\n", + "? \t ^ ^ ^\n", + "+ \tabcdefGhijkl\n", + "? \t ^ ^ ^\n", + ] + ); +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..ca6b1cc --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,169 @@ +pub mod differ; +pub mod sequencematcher; +mod utils; + +use sequencematcher::{Sequence, SequenceMatcher}; +use std::collections::HashMap; +use std::fmt::Display; +use utils::{format_range_context, format_range_unified}; + +pub fn get_close_matches<'a>( + word: &str, + possibilities: Vec<&'a str>, + n: usize, + cutoff: f32, +) -> Vec<&'a str> { + if !(0.0 <= cutoff && cutoff <= 1.0) { + panic!("Cutoff must be greater than 0.0 and lower than 1.0"); + } + let mut res: Vec<(f32, &str)> = Vec::new(); + let mut matcher = SequenceMatcher::new("", word); + for i in &possibilities { + matcher.set_first_seq(i); + let ratio = matcher.ratio(); + if ratio >= cutoff { + res.push((ratio, i)); + } + } + res.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap()); + res.truncate(n); + res.iter().map(|x| x.1).collect() +} + +pub fn unified_diff<T: Sequence + Display>( + first_sequence: &[T], + second_sequence: &[T], + from_file: &str, + to_file: &str, + from_file_date: &str, + to_file_date: &str, + n: usize, +) -> Vec<String> { + let mut res = Vec::new(); + let lineterm = '\n'; + let mut started = false; + let mut matcher = SequenceMatcher::new(first_sequence, second_sequence); + for group in &matcher.get_grouped_opcodes(n) { + if !started { + started = true; + let from_date = format!("\t{}", from_file_date); + let to_date = format!("\t{}", to_file_date); + res.push(format!("--- {}{}{}", from_file, from_date, lineterm)); + res.push(format!("+++ {}{}{}", to_file, to_date, lineterm)); + } + let (first, last) = (group.first().unwrap(), group.last().unwrap()); + let file1_range = format_range_unified(first.first_start, last.first_end); + let file2_range = format_range_unified(first.second_start, last.second_end); + res.push(format!( + "@@ -{} +{} @@{}", + file1_range, file2_range, lineterm + )); + for code in group { + if code.tag == "equal" { + for item in first_sequence + .iter() + .take(code.first_end) + .skip(code.first_start) + { + res.push(format!(" {}", item)); + } + continue; + } + if code.tag == "replace" || code.tag == "delete" { + for item in first_sequence + .iter() + .take(code.first_end) + .skip(code.first_start) + { + res.push(format!("-{}", item)); + } + } + if code.tag == "replace" || code.tag == "insert" { + for item in second_sequence + .iter() + .take(code.second_end) + .skip(code.second_start) + { + res.push(format!("+{}", item)); + } + } + } + } + res +} + +pub fn context_diff<T: Sequence + Display>( + first_sequence: &[T], + second_sequence: &[T], + from_file: &str, + to_file: &str, + from_file_date: &str, + to_file_date: &str, + n: usize, +) -> Vec<String> { + let mut res = Vec::new(); + let lineterm = '\n'; + let mut prefix: HashMap<String, String> = HashMap::new(); + prefix.insert(String::from("insert"), String::from("+ ")); + prefix.insert(String::from("delete"), String::from("- ")); + prefix.insert(String::from("replace"), String::from("! ")); + prefix.insert(String::from("equal"), String::from(" ")); + let mut started = false; + let mut matcher = SequenceMatcher::new(first_sequence, second_sequence); + for group in &matcher.get_grouped_opcodes(n) { + if !started { + started = true; + let from_date = format!("\t{}", from_file_date); + let to_date = format!("\t{}", to_file_date); + res.push(format!("*** {}{}{}", from_file, from_date, lineterm)); + res.push(format!("--- {}{}{}", to_file, to_date, lineterm)); + } + let (first, last) = (group.first().unwrap(), group.last().unwrap()); + res.push(format!("***************{}", lineterm)); + let file1_range = format_range_context(first.first_start, last.first_end); + res.push(format!("*** {} ****{}", file1_range, lineterm)); + let mut any = false; + for opcode in group { + if opcode.tag == "replace" || opcode.tag == "delete" { + any = true; + break; + } + } + if any { + for opcode in group { + if opcode.tag != "insert" { + for item in first_sequence + .iter() + .take(opcode.first_end) + .skip(opcode.first_start) + { + res.push(format!("{}{}", &prefix[&opcode.tag], item)); + } + } + } + } + let file2_range = format_range_context(first.second_start, last.second_end); + res.push(format!("--- {} ----{}", file2_range, lineterm)); + any = false; + for opcode in group { + if opcode.tag == "replace" || opcode.tag == "insert" { + any = true; + break; + } + } + if any { + for opcode in group { + if opcode.tag != "delete" { + for item in second_sequence + .iter() + .take(opcode.second_end) + .skip(opcode.second_start) + { + res.push(format!("{}{}", prefix[&opcode.tag], item)); + } + } + } + } + } + res +} diff --git a/src/sequencematcher.rs b/src/sequencematcher.rs new file mode 100644 index 0000000..4157808 --- /dev/null +++ b/src/sequencematcher.rs @@ -0,0 +1,346 @@ +use std::cmp::{max, min}; +use std::collections::HashMap; +use std::hash::Hash; +use utils::calculate_ratio; + +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)] +pub struct Match { + pub first_start: usize, + pub second_start: usize, + pub size: usize, +} + +impl Match { + fn new(first_start: usize, second_start: usize, size: usize) -> Match { + Match { + first_start, + second_start, + size, + } + } +} + +#[derive(Debug, Clone, PartialEq)] +pub struct Opcode { + pub tag: String, + pub first_start: usize, + pub first_end: usize, + pub second_start: usize, + pub second_end: usize, +} + +impl Opcode { + fn new( + tag: String, + first_start: usize, + first_end: usize, + second_start: usize, + second_end: usize, + ) -> Opcode { + Opcode { + tag, + first_start, + first_end, + second_start, + second_end, + } + } +} + +pub trait Sequence: Eq + Hash {} +impl<T: Eq + Hash> Sequence for T {} + +pub struct SequenceMatcher<'a, T: 'a + Sequence> { + first_sequence: &'a [T], + second_sequence: &'a [T], + matching_blocks: Option<Vec<Match>>, + opcodes: Option<Vec<Opcode>>, + is_junk: Option<fn(&T) -> bool>, + second_sequence_elements: HashMap<&'a T, Vec<usize>>, +} + +impl<'a, T: Sequence> SequenceMatcher<'a, T> { + pub fn new<S>(first_sequence: &'a S, second_sequence: &'a S) -> SequenceMatcher<'a, T> + where + S: AsRef<[T]> + ?Sized, + { + let mut matcher = SequenceMatcher { + first_sequence: first_sequence.as_ref(), + second_sequence: second_sequence.as_ref(), + matching_blocks: None, + opcodes: None, + is_junk: None, + second_sequence_elements: HashMap::new(), + }; + matcher.set_seqs(first_sequence, second_sequence); + matcher + } + + pub fn set_is_junk(&mut self, is_junk: Option<fn(&T) -> bool>) { + self.is_junk = is_junk; + self.matching_blocks = None; + self.opcodes = None; + self.chain_second_seq(); + } + + pub fn set_seqs<S>(&mut self, first_sequence: &'a S, second_sequence: &'a S) + where + S: AsRef<[T]> + ?Sized, + { + self.set_first_seq(first_sequence); + self.set_second_seq(second_sequence); + } + + pub fn set_first_seq<S>(&mut self, sequence: &'a S) + where + S: AsRef<[T]> + ?Sized, + { + self.first_sequence = sequence.as_ref(); + self.matching_blocks = None; + self.opcodes = None; + } + + pub fn set_second_seq<S>(&mut self, sequence: &'a S) + where + S: AsRef<[T]> + ?Sized, + { + self.second_sequence = sequence.as_ref(); + self.matching_blocks = None; + self.opcodes = None; + self.chain_second_seq(); + } + + fn chain_second_seq(&mut self) { + let second_sequence = self.second_sequence; + let mut second_sequence_elements = HashMap::new(); + for (i, item) in second_sequence.iter().enumerate() { + let mut counter = second_sequence_elements + .entry(item) + .or_insert_with(Vec::new); + counter.push(i); + } + if let Some(junk_func) = self.is_junk { + second_sequence_elements = second_sequence_elements + .into_iter() + .filter(|&(element, _)| !junk_func(element)) + .collect(); + } + // Filter out popular elements + let len = second_sequence.len(); + if len >= 200 { + let test_len = (len as f32 / 100.0).floor() as usize + 1; + second_sequence_elements = second_sequence_elements + .into_iter() + .filter(|&(_, ref indexes)| indexes.len() > test_len) + .collect(); + } + self.second_sequence_elements = second_sequence_elements; + } + + pub fn find_longest_match( + &self, + first_start: usize, + first_end: usize, + second_start: usize, + second_end: usize, + ) -> Match { + let first_sequence = &self.first_sequence; + let second_sequence = &self.second_sequence; + let second_sequence_elements = &self.second_sequence_elements; + let (mut best_i, mut best_j, mut best_size) = (first_start, second_start, 0); + let mut j2len: HashMap<usize, usize> = HashMap::new(); + for (i, item) in first_sequence + .iter() + .enumerate() + .take(first_end) + .skip(first_start) + { + let mut new_j2len: HashMap<usize, usize> = HashMap::new(); + if let Some(indexes) = second_sequence_elements.get(item) { + for j in indexes { + let j = *j; + if j < second_start { + continue; + }; + if j >= second_end { + break; + }; + let mut size = 0; + if j > 0 { + if let Some(k) = j2len.get(&(j - 1)) { + size = *k; + } + } + size += 1; + new_j2len.insert(j, size); + if size > best_size { + best_i = i + 1 - size; + best_j = j + 1 - size; + best_size = size; + } + } + } + j2len = new_j2len; + } + for _ in 0..2 { + while best_i > first_start + && best_j > second_start + && first_sequence.get(best_i - 1) == second_sequence.get(best_j - 1) + { + best_i -= 1; + best_j -= 1; + best_size += 1; + } + while best_i + best_size < first_end + && best_j + best_size < second_end + && first_sequence.get(best_i + best_size) == second_sequence.get(best_j + best_size) + { + best_size += 1; + } + } + Match::new(best_i, best_j, best_size) + } + + pub fn get_matching_blocks(&mut self) -> Vec<Match> { + if self.matching_blocks.as_ref().is_some() { + return self.matching_blocks.as_ref().unwrap().clone(); + } + let (first_length, second_length) = (self.first_sequence.len(), self.second_sequence.len()); + let mut matches = Vec::new(); + let mut queue = vec![(0, first_length, 0, second_length)]; + while !queue.is_empty() { + let (first_start, first_end, second_start, second_end) = queue.pop().unwrap(); + let m = self.find_longest_match(first_start, first_end, second_start, second_end); + match m.size { + 0 => {} + _ => { + if first_start < m.first_start && second_start < m.second_start { + queue.push((first_start, m.first_start, second_start, m.second_start)); + } + if m.first_start + m.size < first_end && m.second_start + m.size < second_end { + queue.push(( + m.first_start + m.size, + first_end, + m.second_start + m.size, + second_end, + )); + } + matches.push(m); + } + } + } + matches.sort_by(|a, b| a.cmp(b)); + let (mut first_start, mut second_start, mut size) = (0, 0, 0); + let mut non_adjacent = Vec::new(); + for m in &matches { + if first_start + size == m.first_start && second_start + size == m.second_start { + size += m.size + } else { + if size != 0 { + non_adjacent.push(Match::new(first_start, second_start, size)); + } + first_start = m.first_start; + second_start = m.second_start; + size = m.size; + } + } + if size != 0 { + non_adjacent.push(Match::new(first_start, second_start, size)); + } + non_adjacent.push(Match::new(first_length, second_length, 0)); + self.matching_blocks = Some(non_adjacent); + self.matching_blocks.as_ref().unwrap().clone() + } + + pub fn get_opcodes(&mut self) -> Vec<Opcode> { + if self.opcodes.as_ref().is_some() { + return self.opcodes.as_ref().unwrap().clone(); + } + let mut opcodes = Vec::new(); + let (mut i, mut j) = (0, 0); + for m in self.get_matching_blocks() { + let mut tag = String::new(); + if i < m.first_start && j < m.second_start { + tag = String::from("replace"); + } else if i < m.first_start { + tag = String::from("delete"); + } else if j < m.second_start { + tag = String::from("insert"); + } + if !tag.is_empty() { + opcodes.push(Opcode::new(tag, i, m.first_start, j, m.second_start)); + } + i = m.first_start + m.size; + j = m.second_start + m.size; + if m.size != 0 { + opcodes.push(Opcode::new( + String::from("equal"), + m.first_start, + i, + m.second_start, + j, + )); + } + } + self.opcodes = Some(opcodes); + self.opcodes.as_ref().unwrap().clone() + } + + pub fn get_grouped_opcodes(&mut self, n: usize) -> Vec<Vec<Opcode>> { + let mut res = Vec::new(); + let mut codes = self.get_opcodes(); + if codes.is_empty() { + codes.push(Opcode::new("equal".to_string(), 0, 1, 0, 1)); + } + + if codes.first().unwrap().tag == "equal" { + let opcode = codes.first_mut().unwrap(); + opcode.first_start = max(opcode.first_start, opcode.first_end.saturating_sub(n)); + opcode.second_start = max(opcode.second_start, opcode.second_end.saturating_sub(n)); + } + if codes.last().unwrap().tag == "equal" { + let opcode = codes.last_mut().unwrap(); + opcode.first_end = min(opcode.first_start + n, opcode.first_end); + opcode.second_end = min(opcode.second_start + n, opcode.second_end); + } + let nn = n + n; + let mut group = Vec::new(); + for code in &codes { + let (mut first_start, mut second_start) = (code.first_start, code.second_start); + if code.tag == "equal" && code.first_end - code.first_start > nn { + group.push(Opcode::new( + code.tag.clone(), + code.first_start, + min(code.first_end, code.first_start + n), + code.second_start, + min(code.second_end, code.second_start + n), + )); + res.push(group.clone()); + group.clear(); + first_start = max(first_start, code.first_end.saturating_sub(n)); + second_start = max(second_start, code.second_end.saturating_sub(n)); + } + group.push(Opcode::new( + code.tag.clone(), + first_start, + code.first_end, + second_start, + code.second_end, + )); + } + if !(group.len() == 1 && group.first().unwrap().tag == "equal") || group.is_empty() { + res.push(group.clone()); + } + res + } + + pub fn ratio(&mut self) -> f32 { + let matches = self.get_matching_blocks() + .iter() + .fold(0, |res, &m| res + m.size); + calculate_ratio( + matches, + self.first_sequence.len() + self.second_sequence.len(), + ) + } +} diff --git a/src/utils.rs b/src/utils.rs new file mode 100644 index 0000000..b711bf8 --- /dev/null +++ b/src/utils.rs @@ -0,0 +1,47 @@ +pub fn calculate_ratio(matches: usize, length: usize) -> f32 { + if length != 0 { + return 2.0 * matches as f32 / length as f32; + } + 1.0 +} + +pub fn str_with_similar_chars(c: char, length: usize) -> String { + let mut s = String::new(); + for _ in 0..length { + s.push_str(&c.to_string()); + } + s +} + +pub fn count_leading(line: &str, c: char) -> usize { + let (mut i, n) = (0, line.len()); + let line: Vec<char> = line.chars().collect(); + while (i < n) && line[i] == c { + i += 1; + } + i +} + +pub fn format_range_unified(start: usize, end: usize) -> String { + let mut beginning = start + 1; + let length = end - start; + if length == 1 { + return beginning.to_string(); + } + if length == 0 { + beginning -= 1; + } + format!("{},{}", beginning, length) +} + +pub fn format_range_context(start: usize, end: usize) -> String { + let mut beginning = start + 1; + let length = end - start; + if length == 0 { + beginning -= 1 + } + if length <= 1 { + return beginning.to_string(); + } + format!("{},{}", beginning, beginning + length - 1) +} diff --git a/tests/tests.rs b/tests/tests.rs new file mode 100644 index 0000000..8f49de9 --- /dev/null +++ b/tests/tests.rs @@ -0,0 +1,194 @@ +extern crate difflib; + +use difflib::differ::Differ; +use difflib::sequencematcher::{Match, Opcode, SequenceMatcher}; + +#[test] +fn test_longest_match() { + let matcher = SequenceMatcher::new(" abcd", "abcd abcd"); + let m = matcher.find_longest_match(0, 5, 0, 9); + assert_eq!(m.first_start, 0); + assert_eq!(m.second_start, 4); + assert_eq!(m.size, 5); +} + +#[test] +fn test_all_matches() { + let mut matcher = SequenceMatcher::new("abxcd", "abcd"); + let result = matcher.get_matching_blocks(); + let mut expected_result = Vec::new(); + expected_result.push(Match { + first_start: 0, + second_start: 0, + size: 2, + }); + expected_result.push(Match { + first_start: 3, + second_start: 2, + size: 2, + }); + expected_result.push(Match { + first_start: 5, + second_start: 4, + size: 0, + }); + assert_eq!(result, expected_result); +} + +#[test] +fn test_get_opcodes() { + let mut matcher = SequenceMatcher::new("qabxcd", "abycdf"); + let result = matcher.get_opcodes(); + let mut expected_result = Vec::new(); + expected_result.push(Opcode { + tag: "delete".to_string(), + first_start: 0, + first_end: 1, + second_start: 0, + second_end: 0, + }); + expected_result.push(Opcode { + tag: "equal".to_string(), + first_start: 1, + first_end: 3, + second_start: 0, + second_end: 2, + }); + expected_result.push(Opcode { + tag: "replace".to_string(), + first_start: 3, + first_end: 4, + second_start: 2, + second_end: 3, + }); + expected_result.push(Opcode { + tag: "equal".to_string(), + first_start: 4, + first_end: 6, + second_start: 3, + second_end: 5, + }); + expected_result.push(Opcode { + tag: "insert".to_string(), + first_start: 6, + first_end: 6, + second_start: 5, + second_end: 6, + }); + assert_eq!(result, expected_result); +} + +#[test] +fn test_ratio() { + let mut matcher = SequenceMatcher::new("abcd", "bcde"); + assert_eq!(matcher.ratio(), 0.75); +} + +#[test] +fn test_get_close_matches() { + let words = vec!["ape", "apple", "peach", "puppy"]; + let result = difflib::get_close_matches("appel", words, 3, 0.6); + assert_eq!(result, vec!["apple", "ape"]); +} + +#[test] +fn test_differ_compare() { + let first_text = vec!["one\n", "two\n", "three\n"]; + let second_text = vec!["ore\n", "tree\n", "emu\n"]; + let differ = Differ::new(); + let result = differ.compare(&first_text, &second_text).join(""); + assert_eq!( + result, + "- one\n? ^\n+ ore\n? ^\n- two\n- three\n? -\n+ tree\n+ emu\n" + ); +} + +fn is_junk_char(ch: &char) -> bool { + if *ch == ' ' || *ch == '\t' { + return true; + } + false +} + +#[test] +fn test_differ_compare_with_func() { + let first_text = vec!["one\n", "two\n", "three\n"]; + let second_text = vec!["ore\n", "tree\n", "emu\n"]; + let mut differ = Differ::new(); + differ.char_junk = Some(is_junk_char); + let result = differ.compare(&first_text, &second_text).join(""); + assert_eq!( + result, + "- one\n? ^\n+ ore\n? ^\n- two\n- three\n? -\n+ tree\n+ emu\n" + ); +} + +#[test] +fn test_differ_restore() { + let first_text = vec!["one\n", " two\n", "three\n"]; + let second_text = vec!["ore\n", "tree\n", "emu\n"]; + let differ = Differ::new(); + let diff = differ.compare(&first_text, &second_text); + assert_eq!(first_text, Differ::restore(&diff, 1)); + assert_eq!(second_text, Differ::restore(&diff, 2)); +} + +#[test] +fn test_unified_diff() { + let first_text = "one two three four".split(" ").collect::<Vec<&str>>(); + let second_text = "zero one tree four".split(" ").collect::<Vec<&str>>(); + let result = difflib::unified_diff( + &first_text, + &second_text, + "Original", + "Current", + "2005-01-26 23:30:50", + "2010-04-02 10:20:52", + 3, + ).join(""); + assert_eq!( + result, + "--- Original\t2005-01-26 23:30:50\n+++ Current\t2010-04-02 10:20:52\n@@ -1,4 \ + +1,4 @@\n+zero one-two-three+tree four" + ); +} + +#[test] +fn test_context_diff() { + let first_text = "one two three four".split(" ").collect::<Vec<&str>>(); + let second_text = "zero one tree four".split(" ").collect::<Vec<&str>>(); + let result = difflib::context_diff( + &first_text, + &second_text, + "Original", + "Current", + "2005-01-26 23:30:50", + "2010-04-02 10:20:52", + 3, + ).join(""); + assert_eq!( + result, + "*** Original\t2005-01-26 23:30:50\n--- Current\t2010-04-02 \ + 10:20:52\n***************\n*** 1,4 ****\n one! two! three four--- 1,4 ----\n+ \ + zero one! tree four" + ); +} + +#[test] +fn test_integer_slice() { + let s1 = vec![1, 2, 3, 4, 5]; + let s2 = vec![5, 4, 3, 2, 1]; + let result = SequenceMatcher::new(&s1, &s2).get_matching_blocks(); + let mut expected_result = Vec::new(); + expected_result.push(Match { + first_start: 0, + second_start: 4, + size: 1, + }); + expected_result.push(Match { + first_start: 5, + second_start: 5, + size: 0, + }); + assert_eq!(result, expected_result); +} |