summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.cargo_vcs_info.json6
-rw-r--r--.clippy.toml1
-rw-r--r--.github/FUNDING.yml1
-rw-r--r--.github/workflows/ci.yml81
-rw-r--r--.gitignore2
-rw-r--r--Cargo.toml36
-rw-r--r--Cargo.toml.orig21
-rw-r--r--LICENSE-APACHE176
-rw-r--r--LICENSE-MIT23
-rw-r--r--README.md64
-rw-r--r--benches/bench.rs15
-rw-r--r--benches/document1.txt230
-rw-r--r--benches/document2.txt188
-rw-r--r--src/find.rs232
-rw-r--r--src/lib.rs935
-rw-r--r--src/range.rs141
-rw-r--r--src/tests.rs591
-rw-r--r--tests/test.rs52
18 files changed, 2795 insertions, 0 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
new file mode 100644
index 0000000..77399d2
--- /dev/null
+++ b/.cargo_vcs_info.json
@@ -0,0 +1,6 @@
+{
+ "git": {
+ "sha1": "00360c4b4b2735c5e1cf21e18af8aca3d28eb5ef"
+ },
+ "path_in_vcs": ""
+} \ No newline at end of file
diff --git a/.clippy.toml b/.clippy.toml
new file mode 100644
index 0000000..0a54853
--- /dev/null
+++ b/.clippy.toml
@@ -0,0 +1 @@
+msrv = "1.36.0"
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 0000000..7507077
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1 @@
+github: dtolnay
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..540c84a
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,81 @@
+name: CI
+
+on:
+ push:
+ pull_request:
+ schedule: [cron: "40 1 * * *"]
+
+permissions:
+ contents: read
+
+env:
+ RUSTFLAGS: -Dwarnings
+
+jobs:
+ pre_ci:
+ uses: dtolnay/.github/.github/workflows/pre_ci.yml@master
+
+ test:
+ name: Rust ${{matrix.rust}}
+ needs: pre_ci
+ if: needs.pre_ci.outputs.continue
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ rust: [nightly, beta, stable]
+ timeout-minutes: 45
+ steps:
+ - uses: actions/checkout@v3
+ - uses: dtolnay/rust-toolchain@master
+ with:
+ toolchain: ${{matrix.rust}}
+ - run: cargo test
+ - run: cargo test --benches --release
+ if: matrix.rust == 'nightly'
+
+ msrv:
+ name: Rust 1.36.0
+ needs: pre_ci
+ if: needs.pre_ci.outputs.continue
+ runs-on: ubuntu-latest
+ timeout-minutes: 45
+ steps:
+ - uses: actions/checkout@v3
+ - uses: dtolnay/rust-toolchain@1.36.0
+ - run: cargo check
+
+ fuzz:
+ name: Fuzz
+ needs: pre_ci
+ if: needs.pre_ci.outputs.continue
+ runs-on: ubuntu-latest
+ timeout-minutes: 45
+ steps:
+ - uses: actions/checkout@v3
+ - uses: dtolnay/rust-toolchain@nightly
+ - uses: dtolnay/install@cargo-fuzz
+ - run: cargo fuzz check
+
+ clippy:
+ name: Clippy
+ runs-on: ubuntu-latest
+ if: github.event_name != 'pull_request'
+ timeout-minutes: 45
+ steps:
+ - uses: actions/checkout@v3
+ - uses: dtolnay/rust-toolchain@clippy
+ - run: cargo clippy --tests --benches -- -Dclippy::all -Dclippy::pedantic
+
+ miri:
+ name: Miri
+ needs: pre_ci
+ if: needs.pre_ci.outputs.continue
+ runs-on: ubuntu-latest
+ timeout-minutes: 45
+ steps:
+ - uses: actions/checkout@v3
+ - uses: dtolnay/rust-toolchain@miri
+ - run: cargo miri test
+ env:
+ MIRIFLAGS: -Zmiri-strict-provenance
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..96ef6c0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..3fdb995
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,36 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2018"
+rust-version = "1.36"
+name = "dissimilar"
+version = "1.0.6"
+authors = ["David Tolnay <dtolnay@gmail.com>"]
+description = "Diff library with semantic cleanup, based on Google's diff-match-patch"
+documentation = "https://docs.rs/dissimilar"
+readme = "README.md"
+keywords = ["diff"]
+categories = [
+ "algorithms",
+ "text-processing",
+]
+license = "Apache-2.0"
+repository = "https://github.com/dtolnay/dissimilar"
+
+[package.metadata.docs.rs]
+targets = ["x86_64-unknown-linux-gnu"]
+
+[lib]
+doc-scrape-examples = false
+
+[dev-dependencies.once_cell]
+version = "1"
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
new file mode 100644
index 0000000..f69c79e
--- /dev/null
+++ b/Cargo.toml.orig
@@ -0,0 +1,21 @@
+[package]
+name = "dissimilar"
+version = "1.0.6"
+authors = ["David Tolnay <dtolnay@gmail.com>"]
+categories = ["algorithms", "text-processing"]
+description = "Diff library with semantic cleanup, based on Google's diff-match-patch"
+documentation = "https://docs.rs/dissimilar"
+edition = "2018"
+keywords = ["diff"]
+license = "Apache-2.0" # See the readme. The whole crate is Apache licensed. Some parts are additionally MIT licensed.
+repository = "https://github.com/dtolnay/dissimilar"
+rust-version = "1.36"
+
+[lib]
+doc-scrape-examples = false
+
+[dev-dependencies]
+once_cell = "1"
+
+[package.metadata.docs.rs]
+targets = ["x86_64-unknown-linux-gnu"]
diff --git a/LICENSE-APACHE b/LICENSE-APACHE
new file mode 100644
index 0000000..1b5ec8b
--- /dev/null
+++ b/LICENSE-APACHE
@@ -0,0 +1,176 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644
index 0000000..31aa793
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,23 @@
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..82ce669
--- /dev/null
+++ b/README.md
@@ -0,0 +1,64 @@
+Dissimilar: diff library with semantic cleanup
+==============================================
+
+[<img alt="github" src="https://img.shields.io/badge/github-dtolnay/dissimilar-8da0cb?style=for-the-badge&labelColor=555555&logo=github" height="20">](https://github.com/dtolnay/dissimilar)
+[<img alt="crates.io" src="https://img.shields.io/crates/v/dissimilar.svg?style=for-the-badge&color=fc8d62&logo=rust" height="20">](https://crates.io/crates/dissimilar)
+[<img alt="docs.rs" src="https://img.shields.io/badge/docs.rs-dissimilar-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs" height="20">](https://docs.rs/dissimilar)
+[<img alt="build status" src="https://img.shields.io/github/actions/workflow/status/dtolnay/dissimilar/ci.yml?branch=master&style=for-the-badge" height="20">](https://github.com/dtolnay/dissimilar/actions?query=branch%3Amaster)
+
+This library is a port of the Diff component of [Diff Match Patch] to Rust. The
+diff implementation is based on [Myers' diff algorithm] but includes some
+[semantic cleanups] to increase human readability by factoring out commonalities
+which are likely to be coincidental.
+
+Diff Match Patch was originally built in 2006 to power Google Docs.
+
+[Diff Match Patch]: https://github.com/google/diff-match-patch
+[Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf
+[semantic cleanups]: https://neil.fraser.name/writing/diff/
+
+```toml
+[dependencies]
+dissimilar = "1.0"
+```
+
+*Compiler support: requires rustc 1.36+*
+
+<br>
+
+## Interface
+
+Here is the entire API of the Rust implementation. It operates on borrowed
+strings and the return value of the diff algorithm is a vector of chunks
+pointing into slices of those input strings.
+
+```rust
+pub enum Chunk<'a> {
+ Equal(&'a str),
+ Delete(&'a str),
+ Insert(&'a str),
+}
+
+pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>;
+```
+
+<br>
+
+## License
+
+The diff algorithm in this crate was ported to Rust using the Java and C++
+implementations found at <https://github.com/google/diff-match-patch> as
+reference, and is made available here under the <a href="LICENSE-APACHE">Apache
+License, Version 2.0</a> matching the license of the original. This entire
+project, including some parts unmodified from upstream and the Rust-specific
+modifications introduced in the course of porting the implementation, are
+distributed under this Apache license.
+
+Intellectual property that is unique to the Rust implementation is additionally
+made available to you dually under the <a href="LICENSE-MIT">MIT license</a>, if
+you prefer. This applies to all design choices and implementation choices not
+found in the upstream repo.
+
+Unless you explicitly state otherwise, any contribution intentionally submitted
+for inclusion in this crate by you, as defined in the Apache-2.0 license, shall
+be dual Apache and MIT licensed, without any additional terms or conditions.
diff --git a/benches/bench.rs b/benches/bench.rs
new file mode 100644
index 0000000..72f2633
--- /dev/null
+++ b/benches/bench.rs
@@ -0,0 +1,15 @@
+#![feature(test)]
+
+extern crate test;
+
+use dissimilar::diff;
+use std::{fs, io};
+use test::Bencher;
+
+#[bench]
+fn bench(b: &mut Bencher) -> io::Result<()> {
+ let document1 = fs::read_to_string("benches/document1.txt")?;
+ let document2 = fs::read_to_string("benches/document2.txt")?;
+ b.iter(|| diff(&document1, &document2));
+ Ok(())
+}
diff --git a/benches/document1.txt b/benches/document1.txt
new file mode 100644
index 0000000..54b438f
--- /dev/null
+++ b/benches/document1.txt
@@ -0,0 +1,230 @@
+This is a '''list of newspapers published by [[Journal Register Company]]'''.
+
+The company owns daily and weekly newspapers, other print media properties and newspaper-affiliated local Websites in the [[U.S.]] states of [[Connecticut]], [[Michigan]], [[New York]], [[Ohio]] and [[Pennsylvania]], organized in six geographic "clusters":<ref>[http://www.journalregister.com/newspapers.html Journal Register Company: Our Newspapers], accessed February 10, 2008.</ref>
+
+== Capital-Saratoga ==
+Three dailies, associated weeklies and [[pennysaver]]s in greater [[Albany, New York]]; also [http://www.capitalcentral.com capitalcentral.com] and [http://www.jobsinnewyork.com JobsInNewYork.com].
+
+* ''The Oneida Daily Dispatch'' {{WS|oneidadispatch.com}} of [[Oneida, New York]]
+* ''[[The Record (Troy)|The Record]]'' {{WS|troyrecord.com}} of [[Troy, New York]]
+* ''[[The Saratogian]]'' {{WS|saratogian.com}} of [[Saratoga Springs, New York]]
+* Weeklies:
+** ''Community News'' {{WS|cnweekly.com}} weekly of [[Clifton Park, New York]]
+** ''Rome Observer'' of [[Rome, New York]]
+** ''Life & Times of Utica'' of [[Utica, New York]]
+
+== Connecticut ==
+Five dailies, associated weeklies and [[pennysaver]]s in the state of [[Connecticut]]; also [http://www.ctcentral.com CTcentral.com], [http://www.ctcarsandtrucks.com CTCarsAndTrucks.com] and [http://www.jobsinct.com JobsInCT.com].
+
+* ''The Middletown Press'' {{WS|middletownpress.com}} of [[Middletown, Connecticut|Middletown]]
+* ''[[New Haven Register]]'' {{WS|newhavenregister.com}} of [[New Haven, Connecticut|New Haven]]
+* ''The Register Citizen'' {{WS|registercitizen.com}} of [[Torrington, Connecticut|Torrington]]
+
+* [[New Haven Register#Competitors|Elm City Newspapers]] {{WS|ctcentral.com}}
+** ''The Advertiser'' of [[East Haven, Connecticut|East Haven]]
+** ''Hamden Chronicle'' of [[Hamden, Connecticut|Hamden]]
+** ''Milford Weekly'' of [[Milford, Connecticut|Milford]]
+** ''The Orange Bulletin'' of [[Orange, Connecticut|Orange]]
+** ''The Post'' of [[North Haven, Connecticut|North Haven]]
+** ''Shelton Weekly'' of [[Shelton, Connecticut|Shelton]]
+** ''The Stratford Bard'' of [[Stratford, Connecticut|Stratford]]
+** ''Wallingford Voice'' of [[Wallingford, Connecticut|Wallingford]]
+** ''West Haven News'' of [[West Haven, Connecticut|West Haven]]
+* Housatonic Publications
+** ''The New Milford Times'' {{WS|newmilfordtimes.com}} of [[New Milford, Connecticut|New Milford]]
+** ''The Brookfield Journal'' of [[Brookfield, Connecticut|Brookfield]]
+** ''The Kent Good Times Dispatch'' of [[Kent, Connecticut|Kent]]
+** ''The Bethel Beacon'' of [[Bethel, Connecticut|Bethel]]
+** ''The Litchfield Enquirer'' of [[Litchfield, Connecticut|Litchfield]]
+** ''Litchfield County Times'' of [[Litchfield, Connecticut|Litchfield]]
+* Imprint Newspapers {{WS|imprintnewspapers.com}}
+** ''West Hartford News'' of [[West Hartford, Connecticut|West Hartford]]
+** ''Windsor Journal'' of [[Windsor, Connecticut|Windsor]]
+** ''Windsor Locks Journal'' of [[Windsor Locks, Connecticut|Windsor Locks]]
+** ''Avon Post'' of [[Avon, Connecticut|Avon]]
+** ''Farmington Post'' of [[Farmington, Connecticut|Farmington]]
+** ''Simsbury Post'' of [[Simsbury, Connecticut|Simsbury]]
+** ''Tri-Town Post'' of [[Burlington, Connecticut|Burlington]], [[Canton, Connecticut|Canton]] and [[Harwinton, Connecticut|Harwinton]]
+* Minuteman Publications
+** ''[[Fairfield Minuteman]]'' of [[Fairfield, Connecticut|Fairfield]]
+** ''The Westport Minuteman'' {{WS|westportminuteman.com}} of [[Westport, Connecticut|Westport]]
+* Shoreline Newspapers weeklies:
+** ''Branford Review'' of [[Branford, Connecticut|Branford]]
+** ''Clinton Recorder'' of [[Clinton, Connecticut|Clinton]]
+** ''The Dolphin'' of [[Naval Submarine Base New London]] in [[New London, Connecticut|New London]]
+** ''Main Street News'' {{WS|ctmainstreetnews.com}} of [[Essex, Connecticut|Essex]]
+** ''Pictorial Gazette'' of [[Old Saybrook, Connecticut|Old Saybrook]]
+** ''Regional Express'' of [[Colchester, Connecticut|Colchester]]
+** ''Regional Standard'' of [[Colchester, Connecticut|Colchester]]
+** ''Shoreline Times'' {{WS|shorelinetimes.com}} of [[Guilford, Connecticut|Guilford]]
+** ''Shore View East'' of [[Madison, Connecticut|Madison]]
+** ''Shore View West'' of [[Guilford, Connecticut|Guilford]]
+* Other weeklies:
+** ''Registro'' {{WS|registroct.com}} of [[New Haven, Connecticut|New Haven]]
+** ''Thomaston Express'' {{WS|thomastownexpress.com}} of [[Thomaston, Connecticut|Thomaston]]
+** ''Foothills Traders'' {{WS|foothillstrader.com}} of Torrington, Bristol, Canton
+
+== Michigan ==
+Four dailies, associated weeklies and [[pennysaver]]s in the state of [[Michigan]]; also [http://www.micentralhomes.com MIcentralhomes.com] and [http://www.micentralautos.com MIcentralautos.com]
+* ''[[Oakland Press]]'' {{WS|theoaklandpress.com}} of [[Oakland, Michigan|Oakland]]
+* ''Daily Tribune'' {{WS|dailytribune.com}} of [[Royal Oak, Michigan|Royal Oak]]
+* ''Macomb Daily'' {{WS|macombdaily.com}} of [[Mt. Clemens, Michigan|Mt. Clemens]]
+* ''[[Morning Sun]]'' {{WS|themorningsun.com}} of [[Mount Pleasant, Michigan|Mount Pleasant]]
+* Heritage Newspapers {{WS|heritage.com}}
+** ''Belleville View''
+** ''Ile Camera''
+** ''Monroe Guardian''
+** ''Ypsilanti Courier''
+** ''News-Herald''
+** ''Press & Guide''
+** ''Chelsea Standard & Dexter Leader''
+** ''Manchester Enterprise''
+** ''Milan News-Leader''
+** ''Saline Reporter''
+* Independent Newspapers {{WS|sourcenewspapers.com}}
+** ''Advisor''
+** ''Source''
+* Morning Star {{WS|morningstarpublishing.com}}
+** ''Alma Reminder''
+** ''Alpena Star''
+** ''Antrim County News''
+** ''Carson City Reminder''
+** ''The Leader & Kalkaskian''
+** ''Ogemaw/Oscoda County Star''
+** ''Petoskey/Charlevoix Star''
+** ''Presque Isle Star''
+** ''Preview Community Weekly''
+** ''Roscommon County Star''
+** ''St. Johns Reminder''
+** ''Straits Area Star''
+** ''The (Edmore) Advertiser''
+* Voice Newspapers {{WS|voicenews.com}}
+** ''Armada Times''
+** ''Bay Voice''
+** ''Blue Water Voice''
+** ''Downriver Voice''
+** ''Macomb Township Voice''
+** ''North Macomb Voice''
+** ''Weekend Voice''
+** ''Suburban Lifestyles'' {{WS|suburbanlifestyles.com}}
+
+== Mid-Hudson ==
+One daily, associated magazines in the [[Hudson River Valley]] of [[New York]]; also [http://www.midhudsoncentral.com MidHudsonCentral.com] and [http://www.jobsinnewyork.com JobsInNewYork.com].
+
+* ''[[Daily Freeman]]'' {{WS|dailyfreeman.com}} of [[Kingston, New York]]
+
+== Ohio ==
+Two dailies, associated magazines and three shared Websites, all in the state of [[Ohio]]: [http://www.allaroundcleveland.com AllAroundCleveland.com], [http://www.allaroundclevelandcars.com AllAroundClevelandCars.com] and [http://www.allaroundclevelandjobs.com AllAroundClevelandJobs.com].
+
+* ''[[The News-Herald (Ohio)|The News-Herald]]'' {{WS|news-herald.com}} of [[Willoughby, Ohio|Willoughby]]
+* ''[[The Morning Journal]]'' {{WS|morningjournal.com}} of [[Lorain, Ohio|Lorain]]
+
+== Philadelphia area ==
+Seven dailies and associated weeklies and magazines in [[Pennsylvania]] and [[New Jersey]], and associated Websites: [http://www.allaroundphilly.com AllAroundPhilly.com], [http://www.jobsinnj.com JobsInNJ.com], [http://www.jobsinpa.com JobsInPA.com], and [http://www.phillycarsearch.com PhillyCarSearch.com].
+
+* ''The Daily Local'' {{WS|dailylocal.com}} of [[West Chester, Pennsylvania|West Chester]]
+* ''[[Delaware County Daily and Sunday Times]] {{WS|delcotimes.com}} of Primos
+* ''[[The Mercury (Pennsylvania)|The Mercury]]'' {{WS|pottstownmercury.com}} of [[Pottstown, Pennsylvania|Pottstown]]
+* ''The Phoenix'' {{WS|phoenixvillenews.com}} of [[Phoenixville, Pennsylvania|Phoenixville]]
+* ''[[The Reporter (Lansdale)|The Reporter]]'' {{WS|thereporteronline.com}} of [[Lansdale, Pennsylvania|Lansdale]]
+* ''The Times Herald'' {{WS|timesherald.com}} of [[Norristown, Pennsylvania|Norristown]]
+* ''[[The Trentonian]]'' {{WS|trentonian.com}} of [[Trenton, New Jersey]]
+
+* Weeklies
+** ''El Latino Expreso'' of [[Trenton, New Jersey]]
+** ''La Voz'' of [[Norristown, Pennsylvania]]
+** ''The Village News'' of [[Downingtown, Pennsylvania]]
+** ''The Times Record'' of [[Kennett Square, Pennsylvania]]
+** ''The Tri-County Record'' {{WS|tricountyrecord.com}} of [[Morgantown, Pennsylvania]]
+** ''News of Delaware County'' {{WS|newsofdelawarecounty.com}}of [[Havertown, Pennsylvania]]
+** ''Main Line Times'' {{WS|mainlinetimes.com}}of [[Ardmore, Pennsylvania]]
+** ''Penny Pincher'' of [[Pottstown, Pennsylvania]]
+** ''Town Talk'' {{WS|towntalknews.com}} of [[Ridley, Pennsylvania]]
+* Chesapeake Publishing {{WS|pa8newsgroup.com}}
+** ''Solanco Sun Ledger'' of [[Quarryville, Pennsylvania]]
+** ''Columbia Ledger'' of [[Columbia, Pennsylvania]]
+** ''Coatesville Ledger'' of [[Downingtown, Pennsylvania]]
+** ''Parkesburg Post Ledger'' of [[Quarryville, Pennsylvania]]
+** ''Downingtown Ledger'' of [[Downingtown, Pennsylvania]]
+** ''The Kennett Paper'' of [[Kennett Square, Pennsylvania]]
+** ''Avon Grove Sun'' of [[West Grove, Pennsylvania]]
+** ''Oxford Tribune'' of [[Oxford, Pennsylvania]]
+** ''Elizabethtown Chronicle'' of [[Elizabethtown, Pennsylvania]]
+** ''Donegal Ledger'' of [[Donegal, Pennsylvania]]
+** ''Chadds Ford Post'' of [[Chadds Ford, Pennsylvania]]
+** ''The Central Record'' of [[Medford, New Jersey]]
+** ''Maple Shade Progress'' of [[Maple Shade, New Jersey]]
+* Intercounty Newspapers {{WS|buckslocalnews.com}}
+** ''The Review'' of Roxborough, Pennsylvania
+** ''The Recorder'' of [[Conshohocken, Pennsylvania]]
+** ''The Leader'' of [[Mount Airy, Pennsylvania|Mount Airy]] and West Oak Lake, Pennsylvania
+** ''The Pennington Post'' of [[Pennington, New Jersey]]
+** ''The Bristol Pilot'' of [[Bristol, Pennsylvania]]
+** ''Yardley News'' of [[Yardley, Pennsylvania]]
+** ''New Hope Gazette'' of [[New Hope, Pennsylvania]]
+** ''Doylestown Patriot'' of [[Doylestown, Pennsylvania]]
+** ''Newtown Advance'' of [[Newtown, Pennsylvania]]
+** ''The Plain Dealer'' of [[Williamstown, New Jersey]]
+** ''News Report'' of [[Sewell, New Jersey]]
+** ''Record Breeze'' of [[Berlin, New Jersey]]
+** ''Newsweekly'' of [[Moorestown, New Jersey]]
+** ''Haddon Herald'' of [[Haddonfield, New Jersey]]
+** ''New Egypt Press'' of [[New Egypt, New Jersey]]
+** ''Community News'' of [[Pemberton, New Jersey]]
+** ''Plymouth Meeting Journal'' of [[Plymouth Meeting, Pennsylvania]]
+** ''Lafayette Hill Journal'' of [[Lafayette Hill, Pennsylvania]]
+* Montgomery Newspapers {{WS|montgomerynews.com}}
+** ''Ambler Gazette'' of [[Ambler, Pennsylvania]]
+** ''Central Bucks Life'' of [[Bucks County, Pennsylvania]]
+** ''The Colonial'' of [[Plymouth Meeting, Pennsylvania]]
+** ''Glenside News'' of [[Glenside, Pennsylvania]]
+** ''The Globe'' of [[Lower Moreland Township, Pennsylvania]]
+** ''Main Line Life'' of [[Ardmore, Pennsylvania]]
+** ''Montgomery Life'' of [[Fort Washington, Pennsylvania]]
+** ''North Penn Life'' of [[Lansdale, Pennsylvania]]
+** ''Perkasie News Herald'' of [[Perkasie, Pennsylvania]]
+** ''Public Spirit'' of [[Hatboro, Pennsylvania]]
+** ''Souderton Independent'' of [[Souderton, Pennsylvania]]
+** ''Springfield Sun'' of [[Springfield, Pennsylvania]]
+** ''Spring-Ford Reporter'' of [[Royersford, Pennsylvania]]
+** ''Times Chronicle'' of [[Jenkintown, Pennsylvania]]
+** ''Valley Item'' of [[Perkiomenville, Pennsylvania]]
+** ''Willow Grove Guide'' of [[Willow Grove, Pennsylvania]]
+* News Gleaner Publications (closed December 2008) {{WS|newsgleaner.com}}
+** ''Life Newspapers'' of [[Philadelphia, Pennsylvania]]
+* Suburban Publications
+** ''The Suburban & Wayne Times'' {{WS|waynesuburban.com}} of [[Wayne, Pennsylvania]]
+** ''The Suburban Advertiser'' of [[Exton, Pennsylvania]]
+** ''The King of Prussia Courier'' of [[King of Prussia, Pennsylvania]]
+* Press Newspapers {{WS|countypressonline.com}}
+** ''County Press'' of [[Newtown Square, Pennsylvania]]
+** ''Garnet Valley Press'' of [[Glen Mills, Pennsylvania]]
+** ''Haverford Press'' of [[Newtown Square, Pennsylvania]] (closed January 2009)
+** ''Hometown Press'' of [[Glen Mills, Pennsylvania]] (closed January 2009)
+** ''Media Press'' of [[Newtown Square, Pennsylvania]] (closed January 2009)
+** ''Springfield Press'' of [[Springfield, Pennsylvania]]
+* Berks-Mont Newspapers {{WS|berksmontnews.com}}
+** ''The Boyertown Area Times'' of [[Boyertown, Pennsylvania]]
+** ''The Kutztown Area Patriot'' of [[Kutztown, Pennsylvania]]
+** ''The Hamburg Area Item'' of [[Hamburg, Pennsylvania]]
+** ''The Southern Berks News'' of [[Exeter Township, Berks County, Pennsylvania]]
+** ''The Free Press'' of [[Quakertown, Pennsylvania]]
+** ''The Saucon News'' of [[Quakertown, Pennsylvania]]
+** ''Westside Weekly'' of [[Reading, Pennsylvania]]
+
+* Magazines
+** ''Bucks Co. Town & Country Living''
+** ''Chester Co. Town & Country Living''
+** ''Montomgery Co. Town & Country Living''
+** ''Garden State Town & Country Living''
+** ''Montgomery Homes''
+** ''Philadelphia Golfer''
+** ''Parents Express''
+** ''Art Matters''
+
+{{JRC}}
+
+==References==
+<references />
+
+[[Category:Journal Register publications|*]]
diff --git a/benches/document2.txt b/benches/document2.txt
new file mode 100644
index 0000000..8f25a80
--- /dev/null
+++ b/benches/document2.txt
@@ -0,0 +1,188 @@
+This is a '''list of newspapers published by [[Journal Register Company]]'''.
+
+The company owns daily and weekly newspapers, other print media properties and newspaper-affiliated local Websites in the [[U.S.]] states of [[Connecticut]], [[Michigan]], [[New York]], [[Ohio]], [[Pennsylvania]] and [[New Jersey]], organized in six geographic "clusters":<ref>[http://www.journalregister.com/publications.html Journal Register Company: Our Publications], accessed April 21, 2010.</ref>
+
+== Capital-Saratoga ==
+Three dailies, associated weeklies and [[pennysaver]]s in greater [[Albany, New York]]; also [http://www.capitalcentral.com capitalcentral.com] and [http://www.jobsinnewyork.com JobsInNewYork.com].
+
+* ''The Oneida Daily Dispatch'' {{WS|oneidadispatch.com}} of [[Oneida, New York]]
+* ''[[The Record (Troy)|The Record]]'' {{WS|troyrecord.com}} of [[Troy, New York]]
+* ''[[The Saratogian]]'' {{WS|saratogian.com}} of [[Saratoga Springs, New York]]
+* Weeklies:
+** ''Community News'' {{WS|cnweekly.com}} weekly of [[Clifton Park, New York]]
+** ''Rome Observer'' {{WS|romeobserver.com}} of [[Rome, New York]]
+** ''WG Life '' {{WS|saratogian.com/wglife/}} of [[Wilton, New York]]
+** ''Ballston Spa Life '' {{WS|saratogian.com/bspalife}} of [[Ballston Spa, New York]]
+** ''Greenbush Life'' {{WS|troyrecord.com/greenbush}} of [[Troy, New York]]
+** ''Latham Life'' {{WS|troyrecord.com/latham}} of [[Latham, New York]]
+** ''River Life'' {{WS|troyrecord.com/river}} of [[Troy, New York]]
+
+== Connecticut ==
+Three dailies, associated weeklies and [[pennysaver]]s in the state of [[Connecticut]]; also [http://www.ctcentral.com CTcentral.com], [http://www.ctcarsandtrucks.com CTCarsAndTrucks.com] and [http://www.jobsinct.com JobsInCT.com].
+
+* ''The Middletown Press'' {{WS|middletownpress.com}} of [[Middletown, Connecticut|Middletown]]
+* ''[[New Haven Register]]'' {{WS|newhavenregister.com}} of [[New Haven, Connecticut|New Haven]]
+* ''The Register Citizen'' {{WS|registercitizen.com}} of [[Torrington, Connecticut|Torrington]]
+
+* Housatonic Publications
+** ''The Housatonic Times'' {{WS|housatonictimes.com}} of [[New Milford, Connecticut|New Milford]]
+** ''Litchfield County Times'' {{WS|countytimes.com}} of [[Litchfield, Connecticut|Litchfield]]
+
+* Minuteman Publications
+** ''[[Fairfield Minuteman]]'' {{WS|fairfieldminuteman.com}}of [[Fairfield, Connecticut|Fairfield]]
+** ''The Westport Minuteman'' {{WS|westportminuteman.com}} of [[Westport, Connecticut|Westport]]
+
+* Shoreline Newspapers
+** ''The Dolphin'' {{WS|dolphin-news.com}} of [[Naval Submarine Base New London]] in [[New London, Connecticut|New London]]
+** ''Shoreline Times'' {{WS|shorelinetimes.com}} of [[Guilford, Connecticut|Guilford]]
+
+* Foothills Media Group {{WS|foothillsmediagroup.com}}
+** ''Thomaston Express'' {{WS|thomastonexpress.com}} of [[Thomaston, Connecticut|Thomaston]]
+** ''Good News About Torrington'' {{WS|goodnewsabouttorrington.com}} of [[Torrington, Connecticut|Torrington]]
+** ''Granby News'' {{WS|foothillsmediagroup.com/granby}} of [[Granby, Connecticut|Granby]]
+** ''Canton News'' {{WS|foothillsmediagroup.com/canton}} of [[Canton, Connecticut|Canton]]
+** ''Avon News'' {{WS|foothillsmediagroup.com/avon}} of [[Avon, Connecticut|Avon]]
+** ''Simsbury News'' {{WS|foothillsmediagroup.com/simsbury}} of [[Simsbury, Connecticut|Simsbury]]
+** ''Litchfield News'' {{WS|foothillsmediagroup.com/litchfield}} of [[Litchfield, Connecticut|Litchfield]]
+** ''Foothills Trader'' {{WS|foothillstrader.com}} of Torrington, Bristol, Canton
+
+* Other weeklies
+** ''The Milford-Orange Bulletin'' {{WS|ctbulletin.com}} of [[Orange, Connecticut|Orange]]
+** ''The Post-Chronicle'' {{WS|ctpostchronicle.com}} of [[North Haven, Connecticut|North Haven]]
+** ''West Hartford News'' {{WS|westhartfordnews.com}} of [[West Hartford, Connecticut|West Hartford]]
+
+* Magazines
+** ''The Connecticut Bride'' {{WS|connecticutmag.com}}
+** ''Connecticut Magazine'' {{WS|theconnecticutbride.com}}
+** ''Passport Magazine'' {{WS|passport-mag.com}}
+
+== Michigan ==
+Four dailies, associated weeklies and [[pennysaver]]s in the state of [[Michigan]]; also [http://www.micentralhomes.com MIcentralhomes.com] and [http://www.micentralautos.com MIcentralautos.com]
+* ''[[Oakland Press]]'' {{WS|theoaklandpress.com}} of [[Oakland, Michigan|Oakland]]
+* ''Daily Tribune'' {{WS|dailytribune.com}} of [[Royal Oak, Michigan|Royal Oak]]
+* ''Macomb Daily'' {{WS|macombdaily.com}} of [[Mt. Clemens, Michigan|Mt. Clemens]]
+* ''[[Morning Sun]]'' {{WS|themorningsun.com}} of [[Mount Pleasant, Michigan|Mount Pleasant]]
+
+* Heritage Newspapers {{WS|heritage.com}}
+** ''Belleville View'' {{WS|bellevilleview.com}}
+** ''Ile Camera'' {{WS|thenewsherald.com/ile_camera}}
+** ''Monroe Guardian'' {{WS|monreguardian.com}}
+** ''Ypsilanti Courier'' {{WS|ypsilanticourier.com}}
+** ''News-Herald'' {{WS|thenewsherald.com}}
+** ''Press & Guide'' {{WS|pressandguide.com}}
+** ''Chelsea Standard & Dexter Leader'' {{WS|chelseastandard.com}}
+** ''Manchester Enterprise'' {{WS|manchesterguardian.com}}
+** ''Milan News-Leader'' {{WS|milannews.com}}
+** ''Saline Reporter'' {{WS|salinereporter.com}}
+* Independent Newspapers
+** ''Advisor'' {{WS|sourcenewspapers.com}}
+** ''Source'' {{WS|sourcenewspapers.com}}
+* Morning Star {{WS|morningstarpublishing.com}}
+** ''The Leader & Kalkaskian'' {{WS|leaderandkalkaskian.com}}
+** ''Grand Traverse Insider'' {{WS|grandtraverseinsider.com}}
+** ''Alma Reminder''
+** ''Alpena Star''
+** ''Ogemaw/Oscoda County Star''
+** ''Presque Isle Star''
+** ''St. Johns Reminder''
+
+* Voice Newspapers {{WS|voicenews.com}}
+** ''Armada Times''
+** ''Bay Voice''
+** ''Blue Water Voice''
+** ''Downriver Voice''
+** ''Macomb Township Voice''
+** ''North Macomb Voice''
+** ''Weekend Voice''
+
+== Mid-Hudson ==
+One daily, associated magazines in the [[Hudson River Valley]] of [[New York]]; also [http://www.midhudsoncentral.com MidHudsonCentral.com] and [http://www.jobsinnewyork.com JobsInNewYork.com].
+
+* ''[[Daily Freeman]]'' {{WS|dailyfreeman.com}} of [[Kingston, New York]]
+* ''Las Noticias'' {{WS|lasnoticiasny.com}} of [[Kingston, New York]]
+
+== Ohio ==
+Two dailies, associated magazines and three shared Websites, all in the state of [[Ohio]]: [http://www.allaroundcleveland.com AllAroundCleveland.com], [http://www.allaroundclevelandcars.com AllAroundClevelandCars.com] and [http://www.allaroundclevelandjobs.com AllAroundClevelandJobs.com].
+
+* ''[[The News-Herald (Ohio)|The News-Herald]]'' {{WS|news-herald.com}} of [[Willoughby, Ohio|Willoughby]]
+* ''[[The Morning Journal]]'' {{WS|morningjournal.com}} of [[Lorain, Ohio|Lorain]]
+* ''El Latino Expreso'' {{WS|lorainlatino.com}} of [[Lorain, Ohio|Lorain]]
+
+== Philadelphia area ==
+Seven dailies and associated weeklies and magazines in [[Pennsylvania]] and [[New Jersey]], and associated Websites: [http://www.allaroundphilly.com AllAroundPhilly.com], [http://www.jobsinnj.com JobsInNJ.com], [http://www.jobsinpa.com JobsInPA.com], and [http://www.phillycarsearch.com PhillyCarSearch.com].
+
+* ''[[The Daily Local News]]'' {{WS|dailylocal.com}} of [[West Chester, Pennsylvania|West Chester]]
+* ''[[Delaware County Daily and Sunday Times]]'' {{WS|delcotimes.com}} of Primos [[Upper Darby Township, Pennsylvania]]
+* ''[[The Mercury (Pennsylvania)|The Mercury]]'' {{WS|pottstownmercury.com}} of [[Pottstown, Pennsylvania|Pottstown]]
+* ''[[The Reporter (Lansdale)|The Reporter]]'' {{WS|thereporteronline.com}} of [[Lansdale, Pennsylvania|Lansdale]]
+* ''The Times Herald'' {{WS|timesherald.com}} of [[Norristown, Pennsylvania|Norristown]]
+* ''[[The Trentonian]]'' {{WS|trentonian.com}} of [[Trenton, New Jersey]]
+
+* Weeklies
+* ''The Phoenix'' {{WS|phoenixvillenews.com}} of [[Phoenixville, Pennsylvania]]
+** ''El Latino Expreso'' {{WS|njexpreso.com}} of [[Trenton, New Jersey]]
+** ''La Voz'' {{WS|lavozpa.com}} of [[Norristown, Pennsylvania]]
+** ''The Tri County Record'' {{WS|tricountyrecord.com}} of [[Morgantown, Pennsylvania]]
+** ''Penny Pincher'' {{WS|pennypincherpa.com}}of [[Pottstown, Pennsylvania]]
+
+* Chesapeake Publishing {{WS|southernchestercountyweeklies.com}}
+** ''The Kennett Paper'' {{WS|kennettpaper.com}} of [[Kennett Square, Pennsylvania]]
+** ''Avon Grove Sun'' {{WS|avongrovesun.com}} of [[West Grove, Pennsylvania]]
+** ''The Central Record'' {{WS|medfordcentralrecord.com}} of [[Medford, New Jersey]]
+** ''Maple Shade Progress'' {{WS|mapleshadeprogress.com}} of [[Maple Shade, New Jersey]]
+
+* Intercounty Newspapers {{WS|buckslocalnews.com}} {{WS|southjerseylocalnews.com}}
+** ''The Pennington Post'' {{WS|penningtonpost.com}} of [[Pennington, New Jersey]]
+** ''The Bristol Pilot'' {{WS|bristolpilot.com}} of [[Bristol, Pennsylvania]]
+** ''Yardley News'' {{WS|yardleynews.com}} of [[Yardley, Pennsylvania]]
+** ''Advance of Bucks County'' {{WS|advanceofbucks.com}} of [[Newtown, Pennsylvania]]
+** ''Record Breeze'' {{WS|recordbreeze.com}} of [[Berlin, New Jersey]]
+** ''Community News'' {{WS|sjcommunitynews.com}} of [[Pemberton, New Jersey]]
+
+* Montgomery Newspapers {{WS|montgomerynews.com}}
+** ''Ambler Gazette'' {{WS|amblergazette.com}} of [[Ambler, Pennsylvania]]
+** ''The Colonial'' {{WS|colonialnews.com}} of [[Plymouth Meeting, Pennsylvania]]
+** ''Glenside News'' {{WS|glensidenews.com}} of [[Glenside, Pennsylvania]]
+** ''The Globe'' {{WS|globenewspaper.com}} of [[Lower Moreland Township, Pennsylvania]]
+** ''Montgomery Life'' {{WS|montgomerylife.com}} of [[Fort Washington, Pennsylvania]]
+** ''North Penn Life'' {{WS|northpennlife.com}} of [[Lansdale, Pennsylvania]]
+** ''Perkasie News Herald'' {{WS|perkasienewsherald.com}} of [[Perkasie, Pennsylvania]]
+** ''Public Spirit'' {{WS|thepublicspirit.com}} of [[Hatboro, Pennsylvania]]
+** ''Souderton Independent'' {{WS|soudertonindependent.com}} of [[Souderton, Pennsylvania]]
+** ''Springfield Sun'' {{WS|springfieldsun.com}} of [[Springfield, Pennsylvania]]
+** ''Spring-Ford Reporter'' {{WS|springfordreporter.com}} of [[Royersford, Pennsylvania]]
+** ''Times Chronicle'' {{WS|thetimeschronicle.com}} of [[Jenkintown, Pennsylvania]]
+** ''Valley Item'' {{WS|valleyitem.com}} of [[Perkiomenville, Pennsylvania]]
+** ''Willow Grove Guide'' {{WS|willowgroveguide.com}} of [[Willow Grove, Pennsylvania]]
+** ''The Review'' {{WS|roxreview.com}} of [[Roxborough, Philadelphia, Pennsylvania]]
+
+* Main Line Media News {{WS|mainlinemedianews.com}}
+** ''Main Line Times'' {{WS|mainlinetimes.com}} of [[Ardmore, Pennsylvania]]
+** ''Main Line Life'' {{WS|mainlinelife.com}} of [[Ardmore, Pennsylvania]]
+** ''The King of Prussia Courier'' {{WS|kingofprussiacourier.com}} of [[King of Prussia, Pennsylvania]]
+
+* Delaware County News Network {{WS|delconewsnetwork.com}}
+** ''News of Delaware County'' {{WS|newsofdelawarecounty.com}} of [[Havertown, Pennsylvania]]
+** ''County Press'' {{WS|countypressonline.com}} of [[Newtown Square, Pennsylvania]]
+** ''Garnet Valley Press'' {{WS|countypressonline.com}} of [[Glen Mills, Pennsylvania]]
+** ''Springfield Press'' {{WS|countypressonline.com}} of [[Springfield, Pennsylvania]]
+** ''Town Talk'' {{WS|towntalknews.com}} of [[Ridley, Pennsylvania]]
+
+* Berks-Mont Newspapers {{WS|berksmontnews.com}}
+** ''The Boyertown Area Times'' {{WS|berksmontnews.com/boyertown_area_times}} of [[Boyertown, Pennsylvania]]
+** ''The Kutztown Area Patriot'' {{WS|berksmontnews.com/kutztown_area_patriot}} of [[Kutztown, Pennsylvania]]
+** ''The Hamburg Area Item'' {{WS|berksmontnews.com/hamburg_area_item}} of [[Hamburg, Pennsylvania]]
+** ''The Southern Berks News'' {{WS|berksmontnews.com/southern_berks_news}} of [[Exeter Township, Berks County, Pennsylvania]]
+** ''Community Connection'' {{WS|berksmontnews.com/community_connection}} of [[Boyertown, Pennsylvania]]
+
+* Magazines
+** ''Bucks Co. Town & Country Living'' {{WS|buckscountymagazine.com}}
+** ''Parents Express'' {{WS|parents-express.com}}
+** ''Real Men, Rednecks'' {{WS|realmenredneck.com}}
+
+{{JRC}}
+
+==References==
+<references />
+
+[[Category:Journal Register publications|*]]
diff --git a/src/find.rs b/src/find.rs
new file mode 100644
index 0000000..4af3b8b
--- /dev/null
+++ b/src/find.rs
@@ -0,0 +1,232 @@
+// The strstr implementation in this file is extracted from the Rust standard
+// library's str::find. The algorithm works for arbitrary &[T] haystack and
+// needle but is only exposed by the standard library on UTF-8 strings.
+//
+// https://github.com/rust-lang/rust/blob/1.40.0/src/libcore/str/pattern.rs
+//
+// ---
+//
+// This is the Two-Way search algorithm, which was introduced in the paper:
+// Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
+//
+// Here's some background information.
+//
+// A *word* is a string of symbols. The *length* of a word should be a familiar
+// notion, and here we denote it for any word x by |x|. (We also allow for the
+// possibility of the *empty word*, a word of length zero.)
+//
+// If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be
+// a *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] ==
+// x[i+p]. For example, both 1 and 2 are periods for the string "aa". As another
+// example, the only period of the string "abcd" is 4.
+//
+// We denote by period(x) the *smallest* period of x (provided that x is
+// non-empty). This is always well-defined since every non-empty word x has at
+// least one period, |x|. We sometimes call this *the period* of x.
+//
+// If u, v and x are words such that x = uv, where uv is the concatenation of u
+// and v, then we say that (u, v) is a *factorization* of x.
+//
+// Let (u, v) be a factorization for a word x. Then if w is a non-empty word
+// such that both of the following hold
+//
+// - either w is a suffix of u or u is a suffix of w
+// - either w is a prefix of v or v is a prefix of w
+//
+// then w is said to be a *repetition* for the factorization (u, v).
+//
+// Just to unpack this, there are four possibilities here. Let w = "abc". Then
+// we might have:
+//
+// - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
+// - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
+// - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
+// - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
+//
+// Note that the word vu is a repetition for any factorization (u,v) of x = uv,
+// so every factorization has at least one repetition.
+//
+// If x is a string and (u, v) is a factorization for x, then a *local period*
+// for (u, v) is an integer r such that there is some word w such that |w| = r
+// and w is a repetition for (u, v).
+//
+// We denote by local_period(u, v) the smallest local period of (u, v). We
+// sometimes call this *the local period* of (u, v). Provided that x = uv is
+// non-empty, this is well-defined (because each non-empty word has at least one
+// factorization, as noted above).
+//
+// It can be proven that the following is an equivalent definition of a local
+// period for a factorization (u, v): any positive integer r such that x[i] ==
+// x[i+r] for all i such that |u| - r <= i <= |u| - 1 and such that both x[i]
+// and x[i+r] are defined. (i.e., i > 0 and i + r < |x|).
+//
+// Using the above reformulation, it is easy to prove that
+//
+// 1 <= local_period(u, v) <= period(uv)
+//
+// A factorization (u, v) of x such that local_period(u,v) = period(x) is called
+// a *critical factorization*.
+//
+// The algorithm hinges on the following theorem, which is stated without proof:
+//
+// **Critical Factorization Theorem** Any word x has at least one critical
+// factorization (u, v) such that |u| < period(x).
+//
+// The purpose of maximal_suffix is to find such a critical factorization.
+//
+// If the period is short, compute another factorization x = u' v' to use for
+// reverse search, chosen instead so that |v'| < period(x).
+
+use std::cmp;
+use std::usize;
+
// Search `haystack` for the first occurrence of `needle` using the Two-Way
// algorithm described in the header comment of this file.
//
// Returns the starting index of the first match, or None if the needle does
// not occur. Panics if `needle` is empty (asserted below).
pub fn find(haystack: &[char], needle: &[char]) -> Option<usize> {
    assert!(!needle.is_empty());

    // crit_pos: critical factorization index
    let (crit_pos_false, period_false) = maximal_suffix(needle, false);
    let (crit_pos_true, period_true) = maximal_suffix(needle, true);
    // Per the header comment, the ordering that yields the larger critical
    // position gives a valid critical factorization.
    let (crit_pos, mut period) = if crit_pos_false > crit_pos_true {
        (crit_pos_false, period_false)
    } else {
        (crit_pos_true, period_true)
    };

    // Byteset is an extension (not part of the two way algorithm); it is a
    // 64-bit "fingerprint" where each set bit j corresponds to a (byte & 63) ==
    // j present in the needle.
    let byteset;
    // Index into needle before which we have already matched.
    let mut memory;

    // A particularly readable explanation of what's going on here can be found
    // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
    // see the code for "Algorithm CP" on p. 323.
    //
    // What's going on is we have some critical factorization (u, v) of the
    // needle, and we want to determine whether u is a suffix of &v[..period].
    // If it is, we use "Algorithm CP1". Otherwise we use "Algorithm CP2", which
    // is optimized for when the period of the needle is large.
    let long_period = needle[..crit_pos] != needle[period..period + crit_pos];
    if long_period {
        // Long period case -- we have an approximation to the actual period,
        // and don't use memorization.
        //
        // Approximate the period by lower bound max(|u|, |v|) + 1.
        period = cmp::max(crit_pos, needle.len() - crit_pos) + 1;
        byteset = byteset_create(needle);
        // Dummy value to signify that the period is long.
        memory = usize::MAX;
    } else {
        // Short period case -- the period is exact.
        byteset = byteset_create(&needle[..period]);
        memory = 0;
    }

    // One of the main ideas of Two-Way is that we factorize the needle into two
    // halves, (u, v), and begin trying to find v in the haystack by scanning
    // left to right. If v matches, we try to match u by scanning right to left.
    // How far we can jump when we encounter a mismatch is all based on the fact
    // that (u, v) is a critical factorization for the needle.
    let mut position = 0;
    let needle_last = needle.len() - 1;
    'search: loop {
        // Check that we have room to search in. position + needle_last cannot
        // overflow if we assume slices are bounded by isize's range.
        // The `?` here is the only exit that returns None: the candidate
        // window no longer fits inside the haystack.
        let tail_byte = *haystack.get(position + needle_last)?;

        // Quickly skip by large portions unrelated to our substring.
        if !byteset_contains(byteset, tail_byte) {
            position += needle.len();
            if !long_period {
                memory = 0;
            }
            continue 'search;
        }

        // See if the right part of the needle matches.
        // In the short-period case, `memory` lets us skip re-comparing the
        // prefix of v that is already known to match.
        let start = if long_period {
            crit_pos
        } else {
            cmp::max(crit_pos, memory)
        };
        for i in start..needle.len() {
            if needle[i] != haystack[position + i] {
                position += i - crit_pos + 1;
                if !long_period {
                    memory = 0;
                }
                continue 'search;
            }
        }

        // See if the left part of the needle matches.
        let start = if long_period { 0 } else { memory };
        for i in (start..crit_pos).rev() {
            if needle[i] != haystack[position + i] {
                position += period;
                if !long_period {
                    memory = needle.len() - period;
                }
                continue 'search;
            }
        }

        // We have found a match!
        return Some(position);
    }
}
+
// Build a 64-bit fingerprint of `chars`: for every character c present,
// bit (c as u8 & 63) is set. Distinct characters may alias to the same bit;
// the fingerprint is only used to rule out definite non-matches.
fn byteset_create(chars: &[char]) -> u64 {
    let mut set = 0u64;
    for &ch in chars {
        set |= 1 << (ch as u8 & 0x3f);
    }
    set
}
+
// Test whether the fingerprint bit for `ch` (bit `ch as u8 & 63`) is set in
// `byteset`. A false result proves `ch` was absent from the fingerprinted
// characters; a true result may be a collision.
fn byteset_contains(byteset: u64, ch: char) -> bool {
    byteset & (1 << (ch as u8 & 0x3f)) != 0
}
+
+// Compute the maximal suffix of `arr`.
+//
+// The maximal suffix is a possible critical factorization (u, v) of `arr`.
+//
+// Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
+// period of v.
+//
+// `order_greater` determines if lexical order is `<` or `>`. Both
+// orders must be computed -- the ordering with the largest `i` gives
+// a critical factorization.
+//
+// For long period cases, the resulting period is not exact (it is too short).
fn maximal_suffix(arr: &[char], order_greater: bool) -> (usize, usize) {
    let mut left = 0; // Corresponds to i in the paper
    let mut right = 1; // Corresponds to j in the paper
    let mut offset = 0; // Corresponds to k in the paper, but starting at 0
                        // to match 0-based indexing.
    let mut period = 1; // Corresponds to p in the paper

    // Compare the current candidate suffix (starting at `left`) against the
    // challenger (starting at `right`), one position (`offset`) at a time.
    while let Some(&a) = arr.get(right + offset) {
        // `left` will be inbounds when `right` is.
        let b = arr[left + offset];
        if (a < b && !order_greater) || (a > b && order_greater) {
            // Suffix is smaller, period is entire prefix so far.
            right += offset + 1;
            offset = 0;
            period = right - left;
        } else if a == b {
            // Advance through repetition of the current period.
            if offset + 1 == period {
                right += offset + 1;
                offset = 0;
            } else {
                offset += 1;
            }
        } else {
            // Suffix is larger, start over from current location.
            left = right;
            right += 1;
            offset = 0;
            period = 1;
        }
    }
    // (start index of the maximal suffix, its period — approximate in the
    // long-period case, per the comment above this function)
    (left, period)
}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..b66434a
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,935 @@
+//! [![github]](https://github.com/dtolnay/dissimilar)&ensp;[![crates-io]](https://crates.io/crates/dissimilar)&ensp;[![docs-rs]](https://docs.rs/dissimilar)
+//!
+//! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
+//! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
+//! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs
+//!
+//! <br>
+//!
+//! ## Diff library with semantic cleanup, based on Google's diff-match-patch
+//!
+//! This library is a port of the Diff component of [Diff Match Patch] to Rust.
+//! The diff implementation is based on [Myers' diff algorithm] but includes
+//! some [semantic cleanups] to increase human readability by factoring out
+//! commonalities which are likely to be coincidental.
+//!
+//! Diff Match Patch was originally built in 2006 to power Google Docs.
+//!
+//! # Interface
+//!
+//! Here is the entire API of the Rust implementation. It operates on borrowed
+//! strings and the return value of the diff algorithm is a vector of chunks
+//! pointing into slices of those input strings.
+//!
+//! ```
+//! pub enum Chunk<'a> {
+//! Equal(&'a str),
+//! Delete(&'a str),
+//! Insert(&'a str),
+//! }
+//!
+//! # const IGNORE: &str = stringify! {
+//! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>;
+//! # };
+//! ```
+//!
+//! [Diff Match Patch]: https://github.com/google/diff-match-patch
+//! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf
+//! [semantic cleanups]: https://neil.fraser.name/writing/diff/
+
+#![doc(html_root_url = "https://docs.rs/dissimilar/1.0.6")]
+#![allow(
+ clippy::blocks_in_if_conditions,
+ clippy::bool_to_int_with_if,
+ clippy::cast_possible_wrap,
+ clippy::cast_sign_loss,
+ clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127
+ clippy::collapsible_else_if,
+ clippy::comparison_chain,
+ clippy::match_same_arms,
+ clippy::module_name_repetitions,
+ clippy::must_use_candidate,
+ clippy::new_without_default,
+ clippy::octal_escapes,
+ clippy::shadow_unrelated,
+ clippy::similar_names,
+ clippy::too_many_lines,
+ clippy::unseparated_literal_suffix,
+ unused_parens, // false positive on Some(&(mut diff)) pattern
+)]
+
+mod find;
+mod range;
+
+#[cfg(test)]
+mod tests;
+
+use crate::range::{slice, Range};
+use std::cmp;
+use std::collections::VecDeque;
+use std::fmt::{self, Debug, Display, Write};
+
/// One contiguous span of the diff output produced by [`diff`].
///
/// `Equal` and `Delete` chunks are slices of the first input string;
/// `Insert` chunks are slices of the second input string.
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum Chunk<'a> {
    /// Text common to both inputs.
    Equal(&'a str),
    /// Text present only in the first input.
    Delete(&'a str),
    /// Text present only in the second input.
    Insert(&'a str),
}
+
// Internal counterpart of `Chunk`, expressed as `Range`s over the char
// vectors rather than `&str` slices. `Equal` carries a range from each side
// because the two sides borrow from different backing slices
// (`'a` = text1's chars, `'b` = text2's chars).
#[derive(Copy, Clone)]
enum Diff<'a, 'b> {
    Equal(Range<'a>, Range<'b>),
    Delete(Range<'a>),
    Insert(Range<'b>),
}
+
// `'tmp` is a lifetime outlived by both `'a` and `'b`, letting `text` return
// either side's range under a single lifetime.
impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> {
    // The primary text of this diff: the text1-side range for Equal and
    // Delete, the text2-side range for Insert.
    fn text(&self) -> Range<'tmp> {
        match *self {
            Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range,
        }
    }

    // Extend every range held by this diff leftward by `increment` chars.
    fn grow_left(&mut self, increment: usize) {
        self.for_each(|range| {
            range.offset -= increment;
            range.len += increment;
        });
    }

    // Extend every range held by this diff rightward by `increment` chars.
    fn grow_right(&mut self, increment: usize) {
        self.for_each(|range| range.len += increment);
    }

    // Slide every range `increment` chars left without changing its length.
    fn shift_left(&mut self, increment: usize) {
        self.for_each(|range| range.offset -= increment);
    }

    // Slide every range `increment` chars right without changing its length.
    fn shift_right(&mut self, increment: usize) {
        self.for_each(|range| range.offset += increment);
    }

    // Apply `f` to each range held by this diff. The Delete and Insert arms
    // are deliberately separate rather than an or-pattern: their ranges have
    // different lifetime parameters (`'a` vs `'b`), so they cannot bind
    // through a single pattern binding.
    fn for_each(&mut self, f: impl Fn(&mut Range)) {
        match self {
            Diff::Equal(range1, range2) => {
                f(range1);
                f(range2);
            }
            Diff::Delete(range) => f(range),
            Diff::Insert(range) => f(range),
        }
    }
}
+
/// Compute the difference between `text1` and `text2`, returned as a sequence
/// of [`Chunk`]s borrowing from the two input strings.
///
/// The diff is computed over `char`s (so multi-byte characters are never
/// split), run through the cleanup passes, and then mapped back to byte
/// slices of the original inputs.
pub fn diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>> {
    // Work in char space so all indices below count characters, not bytes.
    let chars1: Vec<char> = text1.chars().collect();
    let chars2: Vec<char> = text2.chars().collect();
    let range1 = Range::new(&chars1, ..);
    let range2 = Range::new(&chars2, ..);

    let mut solution = main(range1, range2);
    cleanup_char_boundary(&mut solution);
    cleanup_semantic(&mut solution);
    cleanup_merge(&mut solution);

    // Convert char-based ranges back into byte slices of the original
    // strings, advancing a byte cursor through each input.
    let mut chunks = Vec::new();
    let mut pos1 = 0;
    let mut pos2 = 0;
    for diff in solution.diffs {
        chunks.push(match diff {
            Diff::Equal(range, _) => {
                // Equal text is identical on both sides, so one byte length
                // advances both cursors.
                let len = range.len_bytes();
                let chunk = Chunk::Equal(&text1[pos1..pos1 + len]);
                pos1 += len;
                pos2 += len;
                chunk
            }
            Diff::Delete(range) => {
                let len = range.len_bytes();
                let chunk = Chunk::Delete(&text1[pos1..pos1 + len]);
                pos1 += len;
                chunk
            }
            Diff::Insert(range) => {
                let len = range.len_bytes();
                let chunk = Chunk::Insert(&text2[pos2..pos2 + len]);
                pos2 += len;
                chunk
            }
        });
    }
    chunks
}
+
// A computed diff in char space: the two full input ranges plus the list of
// diffs, whose ranges are substrings of `text1` / `text2`.
struct Solution<'a, 'b> {
    text1: Range<'a>,
    text2: Range<'b>,
    diffs: Vec<Diff<'a, 'b>>,
}
+
// Top-level diff of two char ranges: strip the common prefix and suffix,
// diff the remaining middle block, then re-attach the stripped equalities
// and merge adjacent diffs.
fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> {
    // Keep the untrimmed inputs; the Solution reports them alongside diffs.
    let whole1 = text1;
    let whole2 = text2;

    // Trim off common prefix.
    let common_prefix_len = common_prefix(text1, text2);
    let common_prefix = Diff::Equal(
        text1.substring(..common_prefix_len),
        text2.substring(..common_prefix_len),
    );
    text1 = text1.substring(common_prefix_len..);
    text2 = text2.substring(common_prefix_len..);

    // Trim off common suffix. (Computed after the prefix is removed, so the
    // two equalities never overlap.)
    let common_suffix_len = common_suffix(text1, text2);
    let common_suffix = Diff::Equal(
        text1.substring(text1.len - common_suffix_len..),
        text2.substring(text2.len - common_suffix_len..),
    );
    text1 = text1.substring(..text1.len - common_suffix_len);
    text2 = text2.substring(..text2.len - common_suffix_len);

    // Compute the diff on the middle block.
    let mut solution = Solution {
        text1: whole1,
        text2: whole2,
        diffs: compute(text1, text2),
    };

    // Restore the prefix and suffix.
    if common_prefix_len > 0 {
        solution.diffs.insert(0, common_prefix);
    }
    if common_suffix_len > 0 {
        solution.diffs.push(common_suffix);
    }

    cleanup_merge(&mut solution);

    solution
}
+
+// Find the differences between two texts. Assumes that the texts do not have
+// any common prefix or suffix.
+fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
+ match (text1.is_empty(), text2.is_empty()) {
+ (true, true) => return Vec::new(),
+ (true, false) => return vec![Diff::Insert(text2)],
+ (false, true) => return vec![Diff::Delete(text1)],
+ (false, false) => {}
+ }
+
+ // Check for entire shorter text inside the longer text.
+ if text1.len > text2.len {
+ if let Some(i) = text1.find(text2) {
+ return vec![
+ Diff::Delete(text1.substring(..i)),
+ Diff::Equal(text1.substring(i..i + text2.len), text2),
+ Diff::Delete(text1.substring(i + text2.len..)),
+ ];
+ }
+ } else {
+ if let Some(i) = text2.find(text1) {
+ return vec![
+ Diff::Insert(text2.substring(..i)),
+ Diff::Equal(text1, text2.substring(i..i + text1.len)),
+ Diff::Insert(text2.substring(i + text1.len..)),
+ ];
+ }
+ }
+
+ if text1.len == 1 || text2.len == 1 {
+ // Single character string.
+ // After the previous check, the character can't be an equality.
+ return vec![Diff::Delete(text1), Diff::Insert(text2)];
+ }
+
+ bisect(text1, text2)
+}
+
// Find the 'middle snake' of a diff, split the problem in two and return the
// recursively constructed diff.
//
// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
    let max_d = (text1.len + text2.len + 1) / 2;
    let v_offset = max_d;
    let v_len = 2 * max_d;
    // v1/v2 record, for each diagonal k (stored at index v_offset + k), the
    // furthest-reaching x coordinate of the forward and reverse searches.
    // -1 marks a diagonal not yet visited.
    let mut v1 = vec![-1isize; v_len];
    let mut v2 = vec![-1isize; v_len];
    v1[v_offset + 1] = 0;
    v2[v_offset + 1] = 0;
    let delta = text1.len as isize - text2.len as isize;
    // If the total number of characters is odd, then the front path will
    // collide with the reverse path.
    let front = delta % 2 != 0;
    // Offsets for start and end of k loop.
    // Prevents mapping of space beyond the grid.
    let mut k1start = 0;
    let mut k1end = 0;
    let mut k2start = 0;
    let mut k2end = 0;
    for d in 0..max_d as isize {
        // Walk the front path one step.
        let mut k1 = -d + k1start;
        while k1 <= d - k1end {
            let k1_offset = (v_offset as isize + k1) as usize;
            // Choose whether this step extends from the diagonal below or
            // above, whichever has reached further.
            let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) {
                v1[k1_offset + 1]
            } else {
                v1[k1_offset - 1] + 1
            } as usize;
            let mut y1 = (x1 as isize - k1) as usize;
            // Follow the diagonal (the "snake") while characters match.
            if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) {
                let advance = common_prefix(s1, s2);
                x1 += advance;
                y1 += advance;
            }
            v1[k1_offset] = x1 as isize;
            if x1 > text1.len {
                // Ran off the right of the graph.
                k1end += 2;
            } else if y1 > text2.len {
                // Ran off the bottom of the graph.
                k1start += 2;
            } else if front {
                let k2_offset = v_offset as isize + delta - k1;
                if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 {
                    // Mirror x2 onto top-left coordinate system.
                    let x2 = text1.len as isize - v2[k2_offset as usize];
                    if x1 as isize >= x2 {
                        // Overlap detected.
                        return bisect_split(text1, text2, x1, y1);
                    }
                }
            }
            k1 += 2;
        }

        // Walk the reverse path one step.
        let mut k2 = -d + k2start;
        while k2 <= d - k2end {
            let k2_offset = (v_offset as isize + k2) as usize;
            let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) {
                v2[k2_offset + 1]
            } else {
                v2[k2_offset - 1] + 1
            } as usize;
            let mut y2 = (x2 as isize - k2) as usize;
            // Follow the snake backward from the ends of the two texts.
            if x2 < text1.len && y2 < text2.len {
                let advance = common_suffix(
                    text1.substring(..text1.len - x2),
                    text2.substring(..text2.len - y2),
                );
                x2 += advance;
                y2 += advance;
            }
            v2[k2_offset] = x2 as isize;
            if x2 > text1.len {
                // Ran off the left of the graph.
                k2end += 2;
            } else if y2 > text2.len {
                // Ran off the top of the graph.
                k2start += 2;
            } else if !front {
                let k1_offset = v_offset as isize + delta - k2;
                if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 {
                    let x1 = v1[k1_offset as usize] as usize;
                    let y1 = v_offset + x1 - k1_offset as usize;
                    // Mirror x2 onto top-left coordinate system.
                    x2 = text1.len - x2;
                    if x1 >= x2 {
                        // Overlap detected.
                        return bisect_split(text1, text2, x1, y1);
                    }
                }
            }
            k2 += 2;
        }
    }
    // Number of diffs equals number of characters, no commonality at all.
    vec![Diff::Delete(text1), Diff::Insert(text2)]
}
+
+// Given the location of the 'middle snake', split the diff in two parts and
+// recurse.
+fn bisect_split<'a, 'b>(
+ text1: Range<'a>,
+ text2: Range<'b>,
+ x: usize,
+ y: usize,
+) -> Vec<Diff<'a, 'b>> {
+ let (text1a, text1b) = text1.split_at(x);
+ let (text2a, text2b) = text2.split_at(y);
+
+ // Compute both diffs serially.
+ let mut diffs = main(text1a, text2a).diffs;
+ diffs.extend(main(text1b, text2b).diffs);
+
+ diffs
+}
+
+// Determine the length of the common prefix of two strings.
+fn common_prefix(text1: Range, text2: Range) -> usize {
+ for (i, (b1, b2)) in text1.chars().zip(text2.chars()).enumerate() {
+ if b1 != b2 {
+ return i;
+ }
+ }
+ cmp::min(text1.len, text2.len)
+}
+
+// Determine the length of the common suffix of two strings.
+fn common_suffix(text1: Range, text2: Range) -> usize {
+ for (i, (b1, b2)) in text1.chars().rev().zip(text2.chars().rev()).enumerate() {
+ if b1 != b2 {
+ return i;
+ }
+ }
+ cmp::min(text1.len, text2.len)
+}
+
// Determine if the suffix of one string is the prefix of another.
//
// Returns the number of characters common to the end of the first string and
// the start of the second string.
fn common_overlap(mut text1: Range, mut text2: Range) -> usize {
    // Eliminate the null case.
    if text1.is_empty() || text2.is_empty() {
        return 0;
    }
    // Truncate the longer string: only the last |text2| chars of text1 and
    // the first |text1| chars of text2 can participate in an overlap.
    if text1.len > text2.len {
        text1 = text1.substring(text1.len - text2.len..);
    } else if text1.len < text2.len {
        text2 = text2.substring(..text1.len);
    }
    // Quick check for the worst case: the (now equal-length) strings match
    // in their entirety.
    if slice(text1) == slice(text2) {
        return text1.len;
    }

    // Start by looking for a single character match
    // and increase length until no match is found.
    // Performance analysis: https://neil.fraser.name/news/2010/11/04/
    let mut best = 0;
    let mut length = 1;
    loop {
        // Candidate: the last `length` chars of text1.
        let pattern = text1.substring(text1.len - length..);
        // If this suffix occurs nowhere in text2, no longer overlap can
        // exist either; report the best overlap found so far.
        let found = match text2.find(pattern) {
            Some(found) => found,
            None => return best,
        };
        // Skip ahead: any overlap must be at least this long.
        length += found;
        if found == 0
            || slice(text1.substring(text1.len - length..)) == slice(text2.substring(..length))
        {
            best = length;
            length += 1;
        }
    }
}
+
// Adjust diff boundaries so they do not split a segmentation unit, rewriting
// the diff list in place and dropping sub-character leftovers.
//
// NOTE(review): `is_segmentation_boundary` is a placeholder that treats every
// position as a boundary, so the boundary_* adjustments are all zero today —
// confirm intended behavior once real segmentation is wired in.
fn cleanup_char_boundary(solution: &mut Solution) {
    fn is_segmentation_boundary(doc: &[char], pos: usize) -> bool {
        // FIXME: use unicode-segmentation crate?
        let _ = doc;
        let _ = pos;
        true
    }

    // Distance to move `pos` leftward to reach a boundary.
    fn boundary_down(doc: &[char], pos: usize) -> usize {
        let mut adjust = 0;
        while !is_segmentation_boundary(doc, pos - adjust) {
            adjust += 1;
        }
        adjust
    }

    // Distance to move `pos` rightward to reach a boundary.
    fn boundary_up(doc: &[char], pos: usize) -> usize {
        let mut adjust = 0;
        while !is_segmentation_boundary(doc, pos + adjust) {
            adjust += 1;
        }
        adjust
    }

    // Drop the leading part of `range` already covered by `prev`; expansion
    // of earlier ranges can make consecutive ranges overlap.
    fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) {
        let prev_end = prev.offset + prev.len;
        if prev_end > range.offset {
            let delta = cmp::min(prev_end - range.offset, range.len);
            range.offset += delta;
            range.len -= delta;
        }
    }

    // Two-cursor in-place rewrite: `read` scans, `retain` is where surviving
    // (possibly adjusted) diffs are written back.
    let mut read = 0;
    let mut retain = 0;
    let mut last_delete = Range::empty();
    let mut last_insert = Range::empty();
    while let Some(&(mut diff)) = solution.diffs.get(read) {
        read += 1;
        match &mut diff {
            Diff::Equal(range1, range2) => {
                // Shrink equalities inward so both endpoints sit on
                // boundaries.
                let adjust = boundary_up(range1.doc, range1.offset);
                // If the whole range is sub-character, skip it.
                if range1.len <= adjust {
                    continue;
                }
                range1.offset += adjust;
                range1.len -= adjust;
                range2.offset += adjust;
                range2.len -= adjust;
                let adjust = boundary_down(range1.doc, range1.offset + range1.len);
                range1.len -= adjust;
                range2.len -= adjust;
                last_delete = Range::empty();
                last_insert = Range::empty();
            }
            Diff::Delete(range) => {
                // Grow deletions outward to boundaries, avoiding double
                // coverage with the previous deletion.
                skip_overlap(&last_delete, range);
                if range.len == 0 {
                    continue;
                }
                let adjust = boundary_down(range.doc, range.offset);
                range.offset -= adjust;
                range.len += adjust;
                let adjust = boundary_up(range.doc, range.offset + range.len);
                range.len += adjust;
                last_delete = *range;
            }
            Diff::Insert(range) => {
                // Same outward growth as deletions, tracked separately.
                skip_overlap(&last_insert, range);
                if range.len == 0 {
                    continue;
                }
                let adjust = boundary_down(range.doc, range.offset);
                range.offset -= adjust;
                range.len += adjust;
                let adjust = boundary_up(range.doc, range.offset + range.len);
                range.len += adjust;
                last_insert = *range;
            }
        }
        solution.diffs[retain] = diff;
        retain += 1;
    }

    solution.diffs.truncate(retain);
}
+
// Reduce the number of edits by eliminating semantically trivial equalities.
fn cleanup_semantic(solution: &mut Solution) {
    let mut diffs = &mut solution.diffs;
    if diffs.is_empty() {
        return;
    }

    let mut changes = false;
    let mut equalities = VecDeque::new(); // Double-ended queue of equalities.
    let mut last_equality = None; // Always equal to equalities.peek().text
    let mut pointer = 0;
    // Number of characters that changed prior to the equality.
    let mut len_insertions1 = 0;
    let mut len_deletions1 = 0;
    // Number of characters that changed after the equality.
    let mut len_insertions2 = 0;
    let mut len_deletions2 = 0;
    while let Some(&this_diff) = diffs.get(pointer) {
        match this_diff {
            Diff::Equal(text1, text2) => {
                // New candidate equality: the edits seen so far now count as
                // "before" it, and the "after" counters restart.
                equalities.push_back(pointer);
                len_insertions1 = len_insertions2;
                len_deletions1 = len_deletions2;
                len_insertions2 = 0;
                len_deletions2 = 0;
                last_equality = Some((text1, text2));
                pointer += 1;
                continue;
            }
            Diff::Delete(text) => len_deletions2 += text.len,
            Diff::Insert(text) => len_insertions2 += text.len,
        }
        // Eliminate an equality that is smaller or equal to the edits on both
        // sides of it.
        if last_equality.map_or(false, |(last_equality, _)| {
            last_equality.len <= cmp::max(len_insertions1, len_deletions1)
                && last_equality.len <= cmp::max(len_insertions2, len_deletions2)
        }) {
            // Jump back to offending equality.
            pointer = equalities.pop_back().unwrap();

            // Replace equality with a delete.
            diffs[pointer] = Diff::Delete(last_equality.unwrap().0);
            // Insert a corresponding insert.
            diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1));

            len_insertions1 = 0; // Reset the counters.
            len_insertions2 = 0;
            len_deletions1 = 0;
            len_deletions2 = 0;
            last_equality = None;
            changes = true;

            // Throw away the previous equality (it needs to be reevaluated).
            equalities.pop_back();
            if let Some(back) = equalities.back() {
                // There is a safe equality we can fall back to.
                pointer = *back;
            } else {
                // There are no previous equalities, jump back to the start.
                pointer = 0;
                continue;
            }
        }
        pointer += 1;
    }

    // Normalize the diff.
    if changes {
        cleanup_merge(solution);
    }
    cleanup_semantic_lossless(solution);
    // Reborrow: the cleanup passes above took `solution` by &mut.
    diffs = &mut solution.diffs;

    // Find any overlaps between deletions and insertions.
    // e.g: <del>abcxxx</del><ins>xxxdef</ins>
    //   -> <del>abc</del>xxx<ins>def</ins>
    // e.g: <del>xxxabc</del><ins>defxxx</ins>
    //   -> <ins>def</ins>xxx<del>abc</del>
    // Only extract an overlap if it is as big as the edit ahead or behind it.
    let mut pointer = 1;
    while let Some(&this_diff) = diffs.get(pointer) {
        let prev_diff = diffs[pointer - 1];
        if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) {
            // overlap_len1: suffix of deletion == prefix of insertion.
            // overlap_len2: suffix of insertion == prefix of deletion.
            let overlap_len1 = common_overlap(deletion, insertion);
            let overlap_len2 = common_overlap(insertion, deletion);
            let overlap_min = cmp::min(deletion.len, insertion.len);
            if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min {
                // Overlap found. Insert an equality and trim the surrounding edits.
                diffs.insert(
                    pointer,
                    Diff::Equal(
                        deletion.substring(deletion.len - overlap_len1..deletion.len),
                        insertion.substring(..overlap_len1),
                    ),
                );
                diffs[pointer - 1] =
                    Diff::Delete(deletion.substring(..deletion.len - overlap_len1));
                diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..));
            } else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min {
                // Reverse overlap found.
                // Insert an equality and swap and trim the surrounding edits.
                diffs.insert(
                    pointer,
                    Diff::Equal(
                        deletion.substring(..overlap_len2),
                        insertion.substring(insertion.len - overlap_len2..insertion.len),
                    ),
                );
                diffs[pointer - 1] =
                    Diff::Insert(insertion.substring(..insertion.len - overlap_len2));
                diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..));
            }
            // An extra advance past the equality we may have just inserted.
            pointer += 1;
        }
        pointer += 1;
    }
}
+
// Look for single edits surrounded on both sides by equalities which can be
// shifted sideways to align the edit to a word boundary.
//
// e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
fn cleanup_semantic_lossless(solution: &mut Solution) {
    let diffs = &mut solution.diffs;
    let mut pointer = 1;
    while let Some(&next_diff) = diffs.get(pointer + 1) {
        let prev_diff = diffs[pointer - 1];
        if let (
            Diff::Equal(mut prev_equal1, mut prev_equal2),
            Diff::Equal(mut next_equal1, mut next_equal2),
        ) = (prev_diff, next_diff)
        {
            // This is a single edit surrounded by equalities.
            let mut edit = diffs[pointer];

            // First, shift the edit as far left as possible.
            let common_offset = common_suffix(prev_equal1, edit.text());
            let original_prev_len = prev_equal1.len;
            prev_equal1.len -= common_offset;
            prev_equal2.len -= common_offset;
            edit.shift_left(common_offset);
            next_equal1.offset -= common_offset;
            next_equal1.len += common_offset;
            next_equal2.offset -= common_offset;
            next_equal2.len += common_offset;

            // Second, step character by character right, looking for the best fit.
            let mut best_prev_equal = (prev_equal1, prev_equal2);
            let mut best_edit = edit;
            let mut best_next_equal = (next_equal1, next_equal2);
            let mut best_score = cleanup_semantic_score(prev_equal1, edit.text())
                + cleanup_semantic_score(edit.text(), next_equal1);
            // Keep shifting right while the edit's first char matches the
            // first char of the following equality.
            while !edit.text().is_empty()
                && !next_equal1.is_empty()
                && edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap()
            {
                prev_equal1.len += 1;
                prev_equal2.len += 1;
                edit.shift_right(1);
                next_equal1.offset += 1;
                next_equal1.len -= 1;
                next_equal2.offset += 1;
                next_equal2.len -= 1;
                let score = cleanup_semantic_score(prev_equal1, edit.text())
                    + cleanup_semantic_score(edit.text(), next_equal1);
                // The >= encourages trailing rather than leading whitespace on edits.
                if score >= best_score {
                    best_score = score;
                    best_prev_equal = (prev_equal1, prev_equal2);
                    best_edit = edit;
                    best_next_equal = (next_equal1, next_equal2);
                }
            }

            if original_prev_len != best_prev_equal.0.len {
                // We have an improvement, save it back to the diff.
                if best_next_equal.0.is_empty() {
                    diffs.remove(pointer + 1);
                } else {
                    diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1);
                }
                diffs[pointer] = best_edit;
                if best_prev_equal.0.is_empty() {
                    diffs.remove(pointer - 1);
                    pointer -= 1;
                } else {
                    diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1);
                }
            }
        }
        pointer += 1;
    }
}
+
+// Given two strings, compute a score representing whether the internal boundary
+// falls on logical boundaries.
+//
+// Scores range from 6 (best) to 0 (worst).
+fn cleanup_semantic_score(one: Range, two: Range) -> usize {
+ if one.is_empty() || two.is_empty() {
+ // Edges are the best.
+ return 6;
+ }
+
+ // Each port of this function behaves slightly differently due to subtle
+ // differences in each language's definition of things like 'whitespace'.
+ // Since this function's purpose is largely cosmetic, the choice has been
+ // made to use each language's native features rather than force total
+ // conformity.
+ let char1 = one.chars().next_back().unwrap();
+ let char2 = two.chars().next().unwrap();
+ let non_alphanumeric1 = !char1.is_ascii_alphanumeric();
+ let non_alphanumeric2 = !char2.is_ascii_alphanumeric();
+ let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace();
+ let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace();
+ let line_break1 = whitespace1 && char1.is_control();
+ let line_break2 = whitespace2 && char2.is_control();
+ let blank_line1 =
+ line_break1 && (one.ends_with(['\n', '\n']) || one.ends_with(['\n', '\r', '\n']));
+ let blank_line2 =
+ line_break2 && (two.starts_with(['\n', '\n']) || two.starts_with(['\r', '\n', '\r', '\n']));
+
+ if blank_line1 || blank_line2 {
+ // Five points for blank lines.
+ 5
+ } else if line_break1 || line_break2 {
+ // Four points for line breaks.
+ 4
+ } else if non_alphanumeric1 && !whitespace1 && whitespace2 {
+ // Three points for end of sentences.
+ 3
+ } else if whitespace1 || whitespace2 {
+ // Two points for whitespace.
+ 2
+ } else if non_alphanumeric1 || non_alphanumeric2 {
+ // One point for non-alphanumeric.
+ 1
+ } else {
+ 0
+ }
+}
+
// Reorder and merge like edit sections. Merge equalities. Any edit section can
// move as long as it doesn't cross an equality.
fn cleanup_merge(solution: &mut Solution) {
    let diffs = &mut solution.diffs;
    // Outer loop: repeat the whole pass until the second (shift) sweep makes
    // no more changes.
    while !diffs.is_empty() {
        diffs.push(Diff::Equal(
            solution.text1.substring(solution.text1.len..),
            solution.text2.substring(solution.text2.len..),
        )); // Add a dummy entry at the end.
        let mut pointer = 0;
        // Accumulate runs of deletions and insertions; they are contiguous in
        // each document, so extending `len` concatenates them.
        let mut count_delete = 0;
        let mut count_insert = 0;
        let mut text_delete = Range::empty();
        let mut text_insert = Range::empty();
        while let Some(&this_diff) = diffs.get(pointer) {
            match this_diff {
                Diff::Insert(text) => {
                    count_insert += 1;
                    if text_insert.is_empty() {
                        text_insert = text;
                    } else {
                        text_insert.len += text.len;
                    }
                }
                Diff::Delete(text) => {
                    count_delete += 1;
                    if text_delete.is_empty() {
                        text_delete = text;
                    } else {
                        text_delete.len += text.len;
                    }
                }
                Diff::Equal(text, _) => {
                    // An equality terminates the current run of edits.
                    let count_both = count_delete + count_insert;
                    if count_both > 1 {
                        let both_types = count_delete != 0 && count_insert != 0;
                        // Delete the offending records.
                        diffs.splice(pointer - count_both..pointer, None);
                        pointer -= count_both;
                        if both_types {
                            // Factor out any common prefix.
                            let common_length = common_prefix(text_insert, text_delete);
                            if common_length != 0 {
                                if pointer > 0 {
                                    // Fold the prefix into the preceding
                                    // equality.
                                    match &mut diffs[pointer - 1] {
                                        Diff::Equal(this_diff1, this_diff2) => {
                                            this_diff1.len += common_length;
                                            this_diff2.len += common_length;
                                        }
                                        _ => unreachable!(
                                            "previous diff should have been an equality"
                                        ),
                                    }
                                } else {
                                    // No preceding equality; create one.
                                    diffs.insert(
                                        pointer,
                                        Diff::Equal(
                                            text_delete.substring(..common_length),
                                            text_insert.substring(..common_length),
                                        ),
                                    );
                                    pointer += 1;
                                }
                                text_insert = text_insert.substring(common_length..);
                                text_delete = text_delete.substring(common_length..);
                            }
                            // Factor out any common suffix.
                            let common_length = common_suffix(text_insert, text_delete);
                            if common_length != 0 {
                                diffs[pointer].grow_left(common_length);
                                text_insert.len -= common_length;
                                text_delete.len -= common_length;
                            }
                        }
                        // Insert the merged records.
                        if !text_delete.is_empty() {
                            diffs.insert(pointer, Diff::Delete(text_delete));
                            pointer += 1;
                        }
                        if !text_insert.is_empty() {
                            diffs.insert(pointer, Diff::Insert(text_insert));
                            pointer += 1;
                        }
                    } else if pointer > 0 {
                        if let Some(Diff::Equal(prev_equal1, prev_equal2)) =
                            diffs.get_mut(pointer - 1)
                        {
                            // Merge this equality with the previous one.
                            prev_equal1.len += text.len;
                            prev_equal2.len += text.len;
                            diffs.remove(pointer);
                            pointer -= 1;
                        }
                    }
                    // Reset the accumulators for the next run of edits.
                    count_insert = 0;
                    count_delete = 0;
                    text_delete = Range::empty();
                    text_insert = Range::empty();
                }
            }
            pointer += 1;
        }
        if diffs.last().unwrap().text().is_empty() {
            diffs.pop(); // Remove the dummy entry at the end.
        }

        // Second pass: look for single edits surrounded on both sides by equalities
        // which can be shifted sideways to eliminate an equality.
        // e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
        let mut changes = false;
        let mut pointer = 1;
        // Intentionally ignore the first and last element (don't need checking).
        while let Some(&next_diff) = diffs.get(pointer + 1) {
            let prev_diff = diffs[pointer - 1];
            let this_diff = diffs[pointer];
            if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) {
                // This is a single edit surrounded by equalities.
                if this_diff.text().ends_with(prev_diff) {
                    // Shift the edit over the previous equality.
                    diffs[pointer].shift_left(prev_diff.len);
                    diffs[pointer + 1].grow_left(prev_diff.len);
                    diffs.remove(pointer - 1); // Delete prev_diff.
                    changes = true;
                } else if this_diff.text().starts_with(next_diff) {
                    // Shift the edit over the next equality.
                    diffs[pointer - 1].grow_right(next_diff.len);
                    diffs[pointer].shift_right(next_diff.len);
                    diffs.remove(pointer + 1); // Delete next_diff.
                    changes = true;
                }
            }
            pointer += 1;
        }
        // If shifts were made, the diff needs reordering and another shift sweep.
        if !changes {
            return;
        }
    }
}
+
+impl Debug for Chunk<'_> {
+ fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+ let (name, text) = match *self {
+ Chunk::Equal(text) => ("Equal", text),
+ Chunk::Delete(text) => ("Delete", text),
+ Chunk::Insert(text) => ("Insert", text),
+ };
+ write!(formatter, "{}({:?})", name, text)
+ }
+}
+
+impl Debug for Diff<'_, '_> {
+ fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+ let (name, range) = match *self {
+ Diff::Equal(range, _) => ("Equal", range),
+ Diff::Delete(range) => ("Delete", range),
+ Diff::Insert(range) => ("Insert", range),
+ };
+ formatter.write_str(name)?;
+ formatter.write_str("(\"")?;
+ for ch in range.chars() {
+ if ch == '\'' {
+ // escape_debug turns this into "\'" which is unnecessary.
+ formatter.write_char(ch)?;
+ } else {
+ Display::fmt(&ch.escape_debug(), formatter)?;
+ }
+ }
+ formatter.write_str("\")")?;
+ Ok(())
+ }
+}
diff --git a/src/range.rs b/src/range.rs
new file mode 100644
index 0000000..55cbc44
--- /dev/null
+++ b/src/range.rs
@@ -0,0 +1,141 @@
+use crate::find::find;
+use std::fmt::Debug;
+use std::ops::{self, RangeFrom, RangeFull, RangeTo};
+
// A view into a shared `[char]` document: `len` chars starting at `offset`.
#[derive(Copy, Clone)]
pub struct Range<'a> {
    // The entire backing document.
    pub doc: &'a [char],
    // Start of this range within `doc`.
    pub offset: usize,
    // Number of chars covered by this range.
    pub len: usize,
}
+
impl<'a> Range<'a> {
    /// An empty range over an empty document.
    pub fn empty() -> Self {
        Range {
            doc: &[],
            offset: 0,
            len: 0,
        }
    }

    /// A range over the given bounds of `doc`; panics if out of range.
    pub fn new(doc: &'a [char], bounds: impl RangeBounds) -> Self {
        let (offset, len) = bounds.index(doc.len());
        Range { doc, offset, len }
    }

    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Length of the range in UTF-8 bytes (as opposed to `len`, in chars).
    pub fn len_bytes(&self) -> usize {
        self.chars().map(char::len_utf8).sum()
    }

    /// Sub-range relative to this range's own bounds; panics if out of range.
    pub fn substring(&self, bounds: impl RangeBounds) -> Self {
        let (offset, len) = bounds.index(self.len);
        Range {
            doc: self.doc,
            offset: self.offset + offset,
            len,
        }
    }

    /// Non-panicking variant of `substring`.
    pub fn get(&self, bounds: impl RangeBounds) -> Option<Self> {
        let (offset, len) = bounds.try_index(self.len)?;
        Some(Range {
            doc: self.doc,
            offset: self.offset + offset,
            len,
        })
    }

    /// Split into `[..mid]` and `[mid..]`; panics if `mid > len`.
    pub fn split_at(&self, mid: usize) -> (Self, Self) {
        (self.substring(..mid), self.substring(mid..))
    }

    /// Iterate the chars covered by this range.
    pub fn chars(
        &self,
    ) -> impl Iterator<Item = char> + DoubleEndedIterator + ExactSizeIterator + 'a {
        slice(*self).iter().copied()
    }

    pub fn starts_with(&self, prefix: impl AsRef<[char]>) -> bool {
        slice(*self).starts_with(prefix.as_ref())
    }

    pub fn ends_with(&self, suffix: impl AsRef<[char]>) -> bool {
        slice(*self).ends_with(suffix.as_ref())
    }

    /// Position of the first occurrence of `needle` within this range.
    pub fn find(&self, needle: impl AsRef<[char]>) -> Option<usize> {
        find(slice(*self), needle.as_ref())
    }
}
+
+pub fn slice(range: Range) -> &[char] {
+ if cfg!(debug)
+ && range
+ .doc
+ .get(range.offset..range.offset + range.len)
+ .is_none()
+ {
+ eprintln!(
+ "doc={:?} offset={} len={}",
+ range.doc, range.offset, range.len
+ );
+ }
+ &range.doc[range.offset..range.offset + range.len]
+}
+
// Lets a `Range` be passed anywhere a `&[char]` is accepted (e.g. as the
// needle of `starts_with`/`ends_with`/`find`).
impl AsRef<[char]> for Range<'_> {
    fn as_ref(&self) -> &[char] {
        slice(*self)
    }
}
+
// Abstraction over the `a..b`, `a..`, `..b` and `..` bound forms accepted by
// `Range::new`, `Range::substring` and `Range::get`.
pub trait RangeBounds: Sized + Clone + Debug {
    // Returns (offset, len).
    fn try_index(self, len: usize) -> Option<(usize, usize)>;
    // Panicking variant of `try_index`.
    fn index(self, len: usize) -> (usize, usize) {
        match self.clone().try_index(len) {
            Some(range) => range,
            None => panic!("index out of range, index={:?}, len={}", self, len),
        }
    }
}
+
+impl RangeBounds for ops::Range<usize> {
+ fn try_index(self, len: usize) -> Option<(usize, usize)> {
+ if self.start <= self.end && self.end <= len {
+ Some((self.start, self.end - self.start))
+ } else {
+ None
+ }
+ }
+}
+
+impl RangeBounds for RangeFrom<usize> {
+ fn try_index(self, len: usize) -> Option<(usize, usize)> {
+ if self.start <= len {
+ Some((self.start, len - self.start))
+ } else {
+ None
+ }
+ }
+}
+
+impl RangeBounds for RangeTo<usize> {
+ fn try_index(self, len: usize) -> Option<(usize, usize)> {
+ if self.end <= len {
+ Some((0, self.end))
+ } else {
+ None
+ }
+ }
+}
+
// Full form `..`: always valid, covers the entire length.
impl RangeBounds for RangeFull {
    fn try_index(self, len: usize) -> Option<(usize, usize)> {
        Some((0, len))
    }
}
diff --git a/src/tests.rs b/src/tests.rs
new file mode 100644
index 0000000..d2e3fd6
--- /dev/null
+++ b/src/tests.rs
@@ -0,0 +1,591 @@
+use super::*;
+use once_cell::sync::OnceCell;
+
// Build a `Range` covering a string literal. The chars live in a per-call-site
// `OnceCell` static so the resulting `Range` borrows 'static data and can
// outlive the enclosing statement.
macro_rules! range {
    ($text:expr) => {{
        static CHARS: OnceCell<Vec<char>> = OnceCell::new();
        let chars = CHARS.get_or_init(|| $text.chars().collect());
        Range::new(chars, ..)
    }};
}
+
// Construct a `Solution` from a literal list of diff chunks. text1 is the
// concatenation of the Delete/Equal chunks, text2 of the Insert/Equal chunks,
// and each diff's Range is laid out contiguously over those documents.
macro_rules! diff_list {
    // Empty list: empty documents, no diffs.
    () => {
        Solution {
            text1: Range::empty(),
            text2: Range::empty(),
            diffs: Vec::new(),
        }
    };
    ($($kind:ident($text:literal)),+ $(,)?) => {{
        // Select which chunks contribute to text1 (Delete/Equal only).
        #[allow(unused_macro_rules)]
        macro_rules! text1 {
            (Insert, $s:literal) => { "" };
            (Delete, $s:literal) => { $s };
            (Equal, $s:literal) => { $s };
        }
        // Select which chunks contribute to text2 (Insert/Equal only).
        #[allow(unused_macro_rules)]
        macro_rules! text2 {
            (Insert, $s:literal) => { $s };
            (Delete, $s:literal) => { "" };
            (Equal, $s:literal) => { $s };
        }
        let text1 = range!(concat!($(text1!($kind, $text)),*));
        let text2 = range!(concat!($(text2!($kind, $text)),*));
        // Running cursors into text1 (_i) and text2 (_j), advanced by `range`.
        let (_i, _j) = (&mut 0, &mut 0);
        // Shadowing `range!`: map each chunk to a Diff over the documents.
        #[allow(unused_macro_rules)]
        macro_rules! range {
            (Insert, $s:literal) => {
                Diff::Insert(range(text2.doc, _j, $s))
            };
            (Delete, $s:literal) => {
                Diff::Delete(range(text1.doc, _i, $s))
            };
            (Equal, $s:literal) => {
                Diff::Equal(range(text1.doc, _i, $s), range(text2.doc, _j, $s))
            };
        }
        Solution {
            text1,
            text2,
            diffs: vec![$(range!($kind, $text)),*],
        }
    }};
}
+
+fn range<'a>(doc: &'a [char], offset: &mut usize, text: &str) -> Range<'a> {
+ let len = text.chars().count();
+ let range = Range {
+ doc,
+ offset: *offset,
+ len,
+ };
+ *offset += len;
+ range
+}
+
+macro_rules! assert_diffs {
+ ([$($kind:ident($text:literal)),* $(,)?], $solution:ident, $msg:expr $(,)?) => {
+ let expected = &[$(Chunk::$kind($text)),*];
+ assert!(
+ same_diffs(expected, &$solution.diffs),
+ concat!($msg, "\nexpected={:#?}\nactual={:#?}"),
+ expected, $solution.diffs,
+ );
+ };
+}
+
+fn same_diffs(expected: &[Chunk], actual: &[Diff]) -> bool {
+ fn eq(expected: &str, actual: &Range) -> bool {
+ expected.chars().eq(slice(*actual).iter().copied())
+ }
+
+ expected.len() == actual.len()
+ && expected.iter().zip(actual).all(|pair| match pair {
+ (Chunk::Insert(expected), Diff::Insert(actual)) => eq(expected, actual),
+ (Chunk::Delete(expected), Diff::Delete(actual)) => eq(expected, actual),
+ (Chunk::Equal(expected), Diff::Equal(actual1, actual2)) => {
+ eq(expected, actual1) && eq(expected, actual2)
+ }
+ (_, _) => false,
+ })
+}
+
// Upstream diff-match-patch test: common_prefix on disjoint, partial, and
// whole-string prefixes.
#[test]
fn test_common_prefix() {
    let text1 = range!("abc");
    let text2 = range!("xyz");
    assert_eq!(0, common_prefix(text1, text2), "Null case");

    let text1 = range!("1234abcdef");
    let text2 = range!("1234xyz");
    assert_eq!(4, common_prefix(text1, text2), "Non-null case");

    let text1 = range!("1234");
    let text2 = range!("1234xyz");
    assert_eq!(4, common_prefix(text1, text2), "Whole case");
}
+
// Upstream diff-match-patch test: common_suffix, mirror of the prefix cases.
#[test]
fn test_common_suffix() {
    let text1 = range!("abc");
    let text2 = range!("xyz");
    assert_eq!(0, common_suffix(text1, text2), "Null case");

    let text1 = range!("abcdef1234");
    let text2 = range!("xyz1234");
    assert_eq!(4, common_suffix(text1, text2), "Non-null case");

    let text1 = range!("1234");
    let text2 = range!("xyz1234");
    assert_eq!(4, common_suffix(text1, text2), "Whole case");
}
+
// Upstream diff-match-patch test: common_overlap (suffix of first == prefix
// of second), including a char-based Unicode case.
#[test]
fn test_common_overlap() {
    let text1 = Range::empty();
    let text2 = range!("abcd");
    assert_eq!(0, common_overlap(text1, text2), "Null case");

    let text1 = range!("abc");
    let text2 = range!("abcd");
    assert_eq!(3, common_overlap(text1, text2), "Whole case");

    let text1 = range!("123456");
    let text2 = range!("abcd");
    assert_eq!(0, common_overlap(text1, text2), "No overlap");

    let text1 = range!("123456xxx");
    let text2 = range!("xxxabcd");
    assert_eq!(3, common_overlap(text1, text2), "Overlap");

    // Some overly clever languages (C#) may treat ligatures as equal to their
    // component letters. E.g. U+FB01 == 'fi'
    let text1 = range!("fi");
    let text2 = range!("\u{fb01}i");
    assert_eq!(0, common_overlap(text1, text2), "Unicode");
}
+
// Upstream diff-match-patch test: cleanup_merge — merging like runs, prefix /
// suffix factoring, and sliding edits over adjacent equalities.
#[test]
fn test_cleanup_merge() {
    let mut solution = diff_list![];
    cleanup_merge(&mut solution);
    assert_diffs!([], solution, "Null case");

    let mut solution = diff_list![Equal("a"), Delete("b"), Insert("c")];
    cleanup_merge(&mut solution);
    assert_diffs!(
        [Equal("a"), Delete("b"), Insert("c")],
        solution,
        "No change case",
    );

    let mut solution = diff_list![Equal("a"), Equal("b"), Equal("c")];
    cleanup_merge(&mut solution);
    assert_diffs!([Equal("abc")], solution, "Merge equalities");

    let mut solution = diff_list![Delete("a"), Delete("b"), Delete("c")];
    cleanup_merge(&mut solution);
    assert_diffs!([Delete("abc")], solution, "Merge deletions");

    let mut solution = diff_list![Insert("a"), Insert("b"), Insert("c")];
    cleanup_merge(&mut solution);
    assert_diffs!([Insert("abc")], solution, "Merge insertions");

    let mut solution = diff_list![
        Delete("a"),
        Insert("b"),
        Delete("c"),
        Insert("d"),
        Equal("e"),
        Equal("f"),
    ];
    cleanup_merge(&mut solution);
    assert_diffs!(
        [Delete("ac"), Insert("bd"), Equal("ef")],
        solution,
        "Merge interweave",
    );

    let mut solution = diff_list![Delete("a"), Insert("abc"), Delete("dc")];
    cleanup_merge(&mut solution);
    assert_diffs!(
        [Equal("a"), Delete("d"), Insert("b"), Equal("c")],
        solution,
        "Prefix and suffix detection",
    );

    let mut solution = diff_list![
        Equal("x"),
        Delete("a"),
        Insert("abc"),
        Delete("dc"),
        Equal("y"),
    ];
    cleanup_merge(&mut solution);
    assert_diffs!(
        [Equal("xa"), Delete("d"), Insert("b"), Equal("cy")],
        solution,
        "Prefix and suffix detection with equalities",
    );

    let mut solution = diff_list![Equal("a"), Insert("ba"), Equal("c")];
    cleanup_merge(&mut solution);
    assert_diffs!([Insert("ab"), Equal("ac")], solution, "Slide edit left");

    let mut solution = diff_list![Equal("c"), Insert("ab"), Equal("a")];
    cleanup_merge(&mut solution);
    assert_diffs!([Equal("ca"), Insert("ba")], solution, "Slide edit right");

    let mut solution = diff_list![
        Equal("a"),
        Delete("b"),
        Equal("c"),
        Delete("ac"),
        Equal("x"),
    ];
    cleanup_merge(&mut solution);
    assert_diffs!(
        [Delete("abc"), Equal("acx")],
        solution,
        "Slide edit left recursive",
    );

    let mut solution = diff_list![
        Equal("x"),
        Delete("ca"),
        Equal("c"),
        Delete("b"),
        Equal("a"),
    ];
    cleanup_merge(&mut solution);
    assert_diffs!(
        [Equal("xca"), Delete("cba")],
        solution,
        "Slide edit right recursive",
    );

    let mut solution = diff_list![Delete("b"), Insert("ab"), Equal("c")];
    cleanup_merge(&mut solution);
    assert_diffs!([Insert("a"), Equal("bc")], solution, "Empty range");

    let mut solution = diff_list![Equal(""), Insert("a"), Equal("b")];
    cleanup_merge(&mut solution);
    assert_diffs!([Insert("a"), Equal("b")], solution, "Empty equality");
}
+
// Upstream diff-match-patch test: cleanup_semantic_lossless — aligning edits
// to blank-line, line, word, and sentence boundaries.
#[test]
fn test_cleanup_semantic_lossless() {
    let mut solution = diff_list![];
    cleanup_semantic_lossless(&mut solution);
    assert_diffs!([], solution, "Null case");

    let mut solution = diff_list![
        Equal("AAA\r\n\r\nBBB"),
        Insert("\r\nDDD\r\n\r\nBBB"),
        Equal("\r\nEEE"),
    ];
    cleanup_semantic_lossless(&mut solution);
    assert_diffs!(
        [
            Equal("AAA\r\n\r\n"),
            Insert("BBB\r\nDDD\r\n\r\n"),
            Equal("BBB\r\nEEE"),
        ],
        solution,
        "Blank lines",
    );

    let mut solution = diff_list![Equal("AAA\r\nBBB"), Insert(" DDD\r\nBBB"), Equal(" EEE")];
    cleanup_semantic_lossless(&mut solution);
    assert_diffs!(
        [Equal("AAA\r\n"), Insert("BBB DDD\r\n"), Equal("BBB EEE")],
        solution,
        "Line boundaries",
    );

    let mut solution = diff_list![Equal("The c"), Insert("ow and the c"), Equal("at.")];
    cleanup_semantic_lossless(&mut solution);
    assert_diffs!(
        [Equal("The "), Insert("cow and the "), Equal("cat.")],
        solution,
        "Word boundaries",
    );

    let mut solution = diff_list![Equal("The-c"), Insert("ow-and-the-c"), Equal("at.")];
    cleanup_semantic_lossless(&mut solution);
    assert_diffs!(
        [Equal("The-"), Insert("cow-and-the-"), Equal("cat.")],
        solution,
        "Alphanumeric boundaries",
    );

    let mut solution = diff_list![Equal("a"), Delete("a"), Equal("ax")];
    cleanup_semantic_lossless(&mut solution);
    assert_diffs!([Delete("a"), Equal("aax")], solution, "Hitting the start");

    let mut solution = diff_list![Equal("xa"), Delete("a"), Equal("a")];
    cleanup_semantic_lossless(&mut solution);
    assert_diffs!([Equal("xaa"), Delete("a")], solution, "Hitting the end");

    let mut solution = diff_list![Equal("The xxx. The "), Insert("zzz. The "), Equal("yyy.")];
    cleanup_semantic_lossless(&mut solution);
    assert_diffs!(
        [Equal("The xxx."), Insert(" The zzz."), Equal(" The yyy.")],
        solution,
        "Sentence boundaries",
    );
}
+
// Upstream diff-match-patch test: cleanup_semantic — eliminating trivial
// equalities and extracting forward/reverse overlaps.
#[test]
fn test_cleanup_semantic() {
    let mut solution = diff_list![];
    cleanup_semantic(&mut solution);
    assert_diffs!([], solution, "Null case");

    let mut solution = diff_list![Delete("ab"), Insert("cd"), Equal("12"), Delete("e")];
    cleanup_semantic(&mut solution);
    assert_diffs!(
        [Delete("ab"), Insert("cd"), Equal("12"), Delete("e")],
        solution,
        "No elimination #1",
    );

    let mut solution = diff_list![Delete("abc"), Insert("ABC"), Equal("1234"), Delete("wxyz")];
    cleanup_semantic(&mut solution);
    assert_diffs!(
        [Delete("abc"), Insert("ABC"), Equal("1234"), Delete("wxyz")],
        solution,
        "No elimination #2",
    );

    let mut solution = diff_list![Delete("a"), Equal("b"), Delete("c")];
    cleanup_semantic(&mut solution);
    assert_diffs!([Delete("abc"), Insert("b")], solution, "Simple elimination",);

    let mut solution = diff_list![
        Delete("ab"),
        Equal("cd"),
        Delete("e"),
        Equal("f"),
        Insert("g"),
    ];
    cleanup_semantic(&mut solution);
    assert_diffs!(
        [Delete("abcdef"), Insert("cdfg")],
        solution,
        "Backpass elimination",
    );

    let mut solution = diff_list![
        Insert("1"),
        Equal("A"),
        Delete("B"),
        Insert("2"),
        Equal("_"),
        Insert("1"),
        Equal("A"),
        Delete("B"),
        Insert("2"),
    ];
    cleanup_semantic(&mut solution);
    assert_diffs!(
        [Delete("AB_AB"), Insert("1A2_1A2")],
        solution,
        "Multiple elimination",
    );

    let mut solution = diff_list![Equal("The c"), Delete("ow and the c"), Equal("at.")];
    cleanup_semantic(&mut solution);
    assert_diffs!(
        [Equal("The "), Delete("cow and the "), Equal("cat.")],
        solution,
        "Word boundaries",
    );

    let mut solution = diff_list![Delete("abcxx"), Insert("xxdef")];
    cleanup_semantic(&mut solution);
    assert_diffs!(
        [Delete("abcxx"), Insert("xxdef")],
        solution,
        "No overlap elimination",
    );

    let mut solution = diff_list![Delete("abcxxx"), Insert("xxxdef")];
    cleanup_semantic(&mut solution);
    assert_diffs!(
        [Delete("abc"), Equal("xxx"), Insert("def")],
        solution,
        "Overlap elimination",
    );

    let mut solution = diff_list![Delete("xxxabc"), Insert("defxxx")];
    cleanup_semantic(&mut solution);
    assert_diffs!(
        [Insert("def"), Equal("xxx"), Delete("abc")],
        solution,
        "Reverse overlap elimination",
    );

    let mut solution = diff_list![
        Delete("abcd1212"),
        Insert("1212efghi"),
        Equal("----"),
        Delete("A3"),
        Insert("3BC"),
    ];
    cleanup_semantic(&mut solution);
    assert_diffs!(
        [
            Delete("abcd"),
            Equal("1212"),
            Insert("efghi"),
            Equal("----"),
            Delete("A"),
            Equal("3"),
            Insert("BC"),
        ],
        solution,
        "Two overlap eliminations",
    );
}
+
// Upstream diff-match-patch test: the Myers bisect step on a small input.
#[test]
fn test_bisect() {
    let text1 = range!("cat");
    let text2 = range!("map");
    let solution = Solution {
        text1,
        text2,
        diffs: bisect(text1, text2),
    };
    assert_diffs!(
        [
            Delete("c"),
            Insert("m"),
            Equal("a"),
            Delete("t"),
            Insert("p"),
        ],
        solution,
        "Normal",
    );
}
+
// Upstream diff-match-patch test: the full `main` diff pipeline end to end on
// insertions, deletions, overlaps, and large equalities.
#[test]
fn test_main() {
    let solution = main(Range::empty(), Range::empty());
    assert_diffs!([], solution, "Null case");

    let solution = main(range!("abc"), range!("abc"));
    assert_diffs!([Equal("abc")], solution, "Equality");

    let solution = main(range!("abc"), range!("ab123c"));
    assert_diffs!(
        [Equal("ab"), Insert("123"), Equal("c")],
        solution,
        "Simple insertion",
    );

    let solution = main(range!("a123bc"), range!("abc"));
    assert_diffs!(
        [Equal("a"), Delete("123"), Equal("bc")],
        solution,
        "Simple deletion",
    );

    let solution = main(range!("abc"), range!("a123b456c"));
    assert_diffs!(
        [
            Equal("a"),
            Insert("123"),
            Equal("b"),
            Insert("456"),
            Equal("c"),
        ],
        solution,
        "Two insertions",
    );

    let solution = main(range!("a123b456c"), range!("abc"));
    assert_diffs!(
        [
            Equal("a"),
            Delete("123"),
            Equal("b"),
            Delete("456"),
            Equal("c"),
        ],
        solution,
        "Two deletions",
    );

    let solution = main(range!("a"), range!("b"));
    assert_diffs!([Delete("a"), Insert("b")], solution, "Simple case #1");

    let solution = main(
        range!("Apples are a fruit."),
        range!("Bananas are also fruit."),
    );
    assert_diffs!(
        [
            Delete("Apple"),
            Insert("Banana"),
            Equal("s are a"),
            Insert("lso"),
            Equal(" fruit."),
        ],
        solution,
        "Simple case #2",
    );

    let solution = main(range!("ax\t"), range!("\u{0680}x\000"));
    assert_diffs!(
        [
            Delete("a"),
            Insert("\u{0680}"),
            Equal("x"),
            Delete("\t"),
            Insert("\000"),
        ],
        solution,
        "Simple case #3",
    );

    let solution = main(range!("1ayb2"), range!("abxab"));
    assert_diffs!(
        [
            Delete("1"),
            Equal("a"),
            Delete("y"),
            Equal("b"),
            Delete("2"),
            Insert("xab"),
        ],
        solution,
        "Overlap #1",
    );

    let solution = main(range!("abcy"), range!("xaxcxabc"));
    assert_diffs!(
        [Insert("xaxcx"), Equal("abc"), Delete("y")],
        solution,
        "Overlap #2",
    );

    let solution = main(
        range!("ABCDa=bcd=efghijklmnopqrsEFGHIJKLMNOefg"),
        range!("a-bcd-efghijklmnopqrs"),
    );
    assert_diffs!(
        [
            Delete("ABCD"),
            Equal("a"),
            Delete("="),
            Insert("-"),
            Equal("bcd"),
            Delete("="),
            Insert("-"),
            Equal("efghijklmnopqrs"),
            Delete("EFGHIJKLMNOefg"),
        ],
        solution,
        "Overlap #3",
    );

    let solution = main(
        range!("a [[Pennsylvania]] and [[New"),
        range!(" and [[Pennsylvania]]"),
    );
    assert_diffs!(
        [
            Insert(" "),
            Equal("a"),
            Insert("nd"),
            Equal(" [[Pennsylvania]]"),
            Delete(" and [[New"),
        ],
        solution,
        "Large equality",
    );
}
diff --git a/tests/test.rs b/tests/test.rs
new file mode 100644
index 0000000..7debb05
--- /dev/null
+++ b/tests/test.rs
@@ -0,0 +1,52 @@
+// Upstream diff-match-patch's test suite is imported as unit tests in
+// src/tests.rs, as they test APIs which are private in the Rust implementation.
+//
+// This directory is for Rust-specific integration tests and regression tests.
+
+#![allow(clippy::non_ascii_literal)]
+
+use dissimilar::{diff, Chunk};
+
// Regression test: diffing is char-based, not byte-based, so two chars that
// share a UTF-8 prefix must not produce a partial-byte Equal.
#[test]
fn test_unicode() {
    // Unicode snowman and unicode comet have the same first two bytes. A
    // byte-based diff would produce a 2-byte Equal followed by 1-byte Delete
    // and Insert.
    let snowman = "\u{2603}";
    let comet = "\u{2604}";
    assert_eq!(snowman.as_bytes()[..2], comet.as_bytes()[..2]);

    let d = diff(snowman, comet);
    assert_eq!(d, vec![Chunk::Delete(snowman), Chunk::Insert(comet)]);
}
+
// Regression test for issue #9: multi-byte CJK chars must not be split or
// misaligned by the diff.
#[test]
fn test_issue9() {
    let a = "[乀丁abcd一]";
    let b = "[一abcd丁]";
    let d = diff(a, b);
    assert_eq!(
        d,
        vec![
            Chunk::Equal("["),
            Chunk::Delete("乀丁"),
            Chunk::Insert("一"),
            Chunk::Equal("abcd"),
            Chunk::Delete("一"),
            Chunk::Insert("丁"),
            Chunk::Equal("]"),
        ],
    );
}
+
// Regression test for issue #15: deletion in the middle of CJK text yields
// ranges on valid char boundaries.
#[test]
fn test_issue15() {
    let a = "A のダ";
    let b = "A ダ";
    let d = diff(a, b);

    assert_eq!(
        d,
        vec![Chunk::Equal("A "), Chunk::Delete("の"), Chunk::Equal("ダ")],
    );
}