text-metrics-0.3.0/Data/0000755000000000000000000000000013114303023013167 5ustar0000000000000000text-metrics-0.3.0/Data/Text/0000755000000000000000000000000013117722611014126 5ustar0000000000000000text-metrics-0.3.0/bench-memory/0000755000000000000000000000000013117722611014716 5ustar0000000000000000text-metrics-0.3.0/bench-speed/0000755000000000000000000000000013117722611014506 5ustar0000000000000000text-metrics-0.3.0/tests/0000755000000000000000000000000013117722611013473 5ustar0000000000000000text-metrics-0.3.0/Data/Text/Metrics.hs0000644000000000000000000003121113117722611016066 0ustar0000000000000000-- | -- Module : Data.Text.Metrics -- Copyright : © 2016–2017 Mark Karpov -- License : BSD 3 clause -- -- Maintainer : Mark Karpov -- Stability : experimental -- Portability : portable -- -- The module provides efficient implementations of various strings metric -- algorithms. It works with strict 'Text' values. -- -- __Note__: before version /0.3.0/ the package used C implementations of -- the algorithms under the hood. Beginning from version /0.3.0/, the -- implementations are written in Haskell while staying almost as fast, see: -- -- {-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP #-} {-# LANGUAGE MultiWayIf #-} module Data.Text.Metrics ( -- * Levenshtein variants levenshtein , levenshteinNorm , damerauLevenshtein , damerauLevenshteinNorm -- * Treating inputs like sets , overlap , jaccard -- * Other , hamming , jaro , jaroWinkler ) where import Control.Monad import Control.Monad.ST import Data.Map.Strict (Map) import Data.Ratio import Data.Text import GHC.Exts (inline) import qualified Data.Map.Strict as M import qualified Data.Text as T import qualified Data.Text.Unsafe as TU import qualified Data.Vector.Unboxed.Mutable as VUM #if !MIN_VERSION_base(4,8,0) import Control.Applicative #endif ---------------------------------------------------------------------------- -- Levenshtein variants -- | Return Levenshtein distance between two 'Text' values. Classic -- Levenshtein distance between two strings is the minimal number of -- operations necessary to transform one string into another. For -- Levenshtein distance allowed operations are: deletion, insertion, and -- substitution. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned -- 'Data.Numeric.Natural'. levenshtein :: Text -> Text -> Int levenshtein a b = fst (levenshtein_ a b) -- | Return normalized Levenshtein distance between two 'Text' values. -- Result is a non-negative rational number (represented as @'Ratio' -- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the -- strings, while 1 means exact match. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned @'Ratio' -- 'Data.Numeric.Natural'@. levenshteinNorm :: Text -> Text -> Ratio Int levenshteinNorm = norm levenshtein_ -- | An internal helper, returns Levenshtein distance as the first element -- of the tuple and max length of the two inputs as the second element of -- the tuple. levenshtein_ :: Text -> Text -> (Int, Int) levenshtein_ a b | T.null a = (lenb, lenm) | T.null b = (lena, lenm) | otherwise = runST $ do let v_len = lenb + 1 v <- VUM.unsafeNew (v_len * 2) let gov !i = when (i < v_len) $ do VUM.unsafeWrite v i i gov (i + 1) goi !i !na !v0 !v1 = do let !(TU.Iter ai da) = TU.iter a na goj !j !nb = when (j < lenb) $ do let !(TU.Iter bj db) = TU.iter b nb cost = if ai == bj then 0 else 1 x <- (+ 1) <$> VUM.unsafeRead v (v1 + j) y <- (+ 1) <$> VUM.unsafeRead v (v0 + j + 1) z <- (+ cost) <$> VUM.unsafeRead v (v0 + j) VUM.unsafeWrite v (v1 + j + 1) (min x (min y z)) goj (j + 1) (nb + db) when (i < lena) $ do VUM.unsafeWrite v v1 (i + 1) goj 0 0 goi (i + 1) (na + da) v1 v0 gov 0 goi 0 0 0 v_len ld <- VUM.unsafeRead v (lenb + if even lena then 0 else v_len) return (ld, lenm) where lena = T.length a lenb = T.length b lenm = max lena lenb {-# INLINE levenshtein_ #-} -- | Return Damerau-Levenshtein distance between two 'Text' values. The -- function works like 'levenshtein', but the collection of allowed -- operations also includes transposition of two /adjacent/ characters. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned -- 'Data.Numeric.Natural'. damerauLevenshtein :: Text -> Text -> Int damerauLevenshtein a b = fst (damerauLevenshtein_ a b) -- | Return normalized Damerau-Levenshtein distance between two 'Text' -- values. 0 signifies no similarity between the strings, while 1 means -- exact match. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned @'Ratio' -- 'Data.Numeric.Natural'@. damerauLevenshteinNorm :: Text -> Text -> Ratio Int damerauLevenshteinNorm = norm damerauLevenshtein_ -- | An internal helper, returns Damerau-Levenshtein distance as the first -- element of the tuple and max length of the two inputs as the second -- element of the tuple. damerauLevenshtein_ :: Text -> Text -> (Int, Int) damerauLevenshtein_ a b | T.null a = (lenb, lenm) | T.null b = (lena, lenm) | otherwise = runST $ do let v_len = lenb + 1 v <- VUM.unsafeNew (v_len * 3) let gov !i = when (i < v_len) $ do VUM.unsafeWrite v i i gov (i + 1) goi !i !na !ai_1 !v0 !v1 !v2 = do let !(TU.Iter ai da) = TU.iter a na goj !j !nb !bj_1 = when (j < lenb) $ do let !(TU.Iter bj db) = TU.iter b nb cost = if ai == bj then 0 else 1 x <- (+ 1) <$> VUM.unsafeRead v (v1 + j) y <- (+ 1) <$> VUM.unsafeRead v (v0 + j + 1) z <- (+ cost) <$> VUM.unsafeRead v (v0 + j) let g = min x (min y z) val <- (+ cost) <$> VUM.unsafeRead v (v2 + j - 1) VUM.unsafeWrite v (v1 + j + 1) $ if i > 0 && j > 0 && ai == bj_1 && ai_1 == bj && val < g then val else g goj (j + 1) (nb + db) bj when (i < lena) $ do VUM.unsafeWrite v v1 (i + 1) goj 0 0 'a' goi (i + 1) (na + da) ai v1 v2 v0 gov 0 goi 0 0 'a' 0 v_len (v_len * 2) ld <- VUM.unsafeRead v (lenb + (lena `mod` 3) * v_len) return (ld, lenm) where lena = T.length a lenb = T.length b lenm = max lena lenb {-# INLINE damerauLevenshtein_ #-} ---------------------------------------------------------------------------- -- Treating inputs like sets -- | Return overlap coefficient for two 'Text' values. Returned value is in -- the range from 0 (no similarity) to 1 (exact match). Return 1 if both -- 'Text' values are empty. -- -- See also: . -- -- @since 0.3.0 overlap :: Text -> Text -> Ratio Int overlap a b = if d == 0 then 1 % 1 else intersectionSize (mkTextMap a) (mkTextMap b) % d where d = min (T.length a) (T.length b) -- | Return Jaccard similarity coefficient for two 'Text' values. Returned -- value is in the range from 0 (no similarity) to 1 (exact match). Return 1 -- if both -- -- See also: -- -- @since 0.3.0 jaccard :: Text -> Text -> Ratio Int jaccard a b = if d == 0 then 1 % 1 else intersectionSize ma mb % d where ma = mkTextMap a mb = mkTextMap b d = unionSize ma mb -- | Make a map from 'Char' to 'Int' representing how many times the 'Char' -- appears in the input 'Text'. mkTextMap :: Text -> Map Char Int mkTextMap = T.foldl' f M.empty where f m ch = M.insertWith (+) ch 1 m {-# INLINE mkTextMap #-} -- | Return intersection size between two 'Text'-maps. intersectionSize :: Map Char Int -> Map Char Int -> Int intersectionSize a b = M.foldl' (+) 0 (M.intersectionWith min a b) {-# INLINE intersectionSize #-} -- | Return union size between two 'Text'-maps. unionSize :: Map Char Int -> Map Char Int -> Int unionSize a b = M.foldl' (+) 0 (M.unionWith max a b) {-# INLINE unionSize #-} ---------------------------------------------------------------------------- -- Other -- | /O(n)/ Return Hamming distance between two 'Text' values. Hamming -- distance is defined as the number of positions at which the corresponding -- symbols are different. The input 'Text' values should be of equal length -- or 'Nothing' will be returned. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned @'Maybe' -- 'Data.Numeric.Natural'@. hamming :: Text -> Text -> Maybe Int hamming a b = if T.length a == T.length b then Just (go 0 0 0) else Nothing where go !na !nb !r = let !(TU.Iter cha da) = TU.iter a na !(TU.Iter chb db) = TU.iter b nb in if | na == len -> r | cha /= chb -> go (na + da) (nb + db) (r + 1) | otherwise -> go (na + da) (nb + db) r len = TU.lengthWord16 a -- | Return Jaro distance between two 'Text' values. Returned value is in -- the range from 0 (no similarity) to 1 (exact match). -- -- While the algorithm is pretty clear for artificial examples (like those -- from the linked Wikipedia article), for /arbitrary/ strings, it may be -- hard to decide which of two strings should be considered as one having -- “reference” order of characters (order of matching characters in an -- essential part of the definition of the algorithm). This makes us -- consider the first string the “reference” string (with correct order of -- characters). Thus generally, -- -- > jaro a b ≠ jaro b a -- -- This asymmetry can be found in all implementations of the algorithm on -- the internet, AFAIK. -- -- See also: -- -- @since 0.2.0 -- -- __Heads up__, before version /0.3.0/ this function returned @'Ratio' -- 'Data.Numeric.Natural'@. jaro :: Text -> Text -> Ratio Int jaro a b = if T.null a || T.null b then 0 % 1 else runST $ do let lena = T.length a lenb = T.length b d = if lena >= 2 && lenb >= 2 then max lena lenb `quot` 2 - 1 else 0 v <- VUM.replicate lenb (0 :: Int) r <- VUM.replicate 3 (0 :: Int) -- tj, m, t let goi !i !na !fromb = do let !(TU.Iter ai da) = TU.iter a na (from, fromb') = if i >= d then (i - d, fromb + TU.iter_ b fromb) else (0, 0) to = min (i + d + 1) lenb goj !j !nb = when (j < to) $ do let !(TU.Iter bj db) = TU.iter b nb used <- (== 1) <$> VUM.unsafeRead v j if not used && ai == bj then do tj <- VUM.unsafeRead r 0 if j < tj then VUM.unsafeModify r (+ 1) 2 else VUM.unsafeWrite r 0 j VUM.unsafeWrite v j 1 VUM.unsafeModify r (+ 1) 1 else goj (j + 1) (nb + db) when (i < lena) $ do goj from fromb goi (i + 1) (na + da) fromb' goi 0 0 0 m <- VUM.unsafeRead r 1 t <- VUM.unsafeRead r 2 return $ if m == 0 then 0 % 1 else ((m % lena) + (m % lenb) + ((m - t) % m)) / 3 -- | Return Jaro-Winkler distance between two 'Text' values. Returned value -- is in range from 0 (no similarity) to 1 (exact match). -- -- See also: -- -- @since 0.2.0 -- -- __Heads up__, before version /0.3.0/ this function returned @'Ratio' -- 'Data.Numeric.Natural'@. jaroWinkler :: Text -> Text -> Ratio Int jaroWinkler a b = dj + (1 % 10) * l * (1 - dj) where dj = inline (jaro a b) l = fromIntegral (commonPrefix a b) -- | Return length of common prefix two 'Text' values have. commonPrefix :: Text -> Text -> Int commonPrefix a b = go 0 0 0 where go !na !nb !r = let !(TU.Iter cha da) = TU.iter a na !(TU.Iter chb db) = TU.iter b nb in if | na == lena -> r | nb == lenb -> r | cha == chb -> go (na + da) (nb + db) (r + 1) | otherwise -> r lena = TU.lengthWord16 a lenb = TU.lengthWord16 b {-# INLINE commonPrefix #-} ---------------------------------------------------------------------------- -- Helpers norm :: (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int norm f a b = let (r, l) = f a b in if r == 0 then 1 % 1 else 1 % 1 - r % l {-# INLINE norm #-} text-metrics-0.3.0/tests/Main.hs0000644000000000000000000001502713117722611014720 0ustar0000000000000000{-# LANGUAGE CPP #-} {-# LANGUAGE OverloadedStrings #-} {-# OPTIONS_GHC -fno-warn-orphans #-} module Main (main) where import Data.Ratio import Data.Text (Text) import Data.Text.Metrics import Test.Hspec import Test.QuickCheck import qualified Data.Text as T #if !MIN_VERSION_base(4,8,0) import Control.Applicative #endif instance Arbitrary Text where arbitrary = T.pack <$> arbitrary main :: IO () main = hspec spec spec :: Spec spec = do describe "levenshtein" $ do testSwap levenshtein context "with concrete examples" $ do testPair levenshtein "kitten" "sitting" 3 testPair levenshtein "cake" "drake" 2 testPair levenshtein "saturday" "sunday" 3 testPair levenshtein "red" "wax" 3 #if __GLASGOW_HASKELL__ >= 710 testPair levenshtein "a😀c" "abc" 1 #endif testPair levenshtein "lucky" "lucky" 0 testPair levenshtein "" "" 0 describe "levenshteinNorm" $ do testSwap levenshteinNorm testPair levenshteinNorm "kitten" "sitting" (4 % 7) testPair levenshteinNorm "cake" "drake" (3 % 5) testPair levenshteinNorm "saturday" "sunday" (5 % 8) testPair levenshteinNorm "red" "wax" (0 % 1) #if __GLASGOW_HASKELL__ >= 710 testPair levenshteinNorm "a😀c" "abc" (2 % 3) #endif testPair levenshteinNorm "lucky" "lucky" (1 % 1) testPair levenshteinNorm "" "" (1 % 1) describe "damerauLevenshtein" $ do testSwap damerauLevenshtein testPair damerauLevenshtein "veryvery long" "very long" 4 testPair damerauLevenshtein "thing" "think" 1 testPair damerauLevenshtein "nose" "ones" 2 testPair damerauLevenshtein "thing" "sign" 3 testPair damerauLevenshtein "red" "wax" 3 #if __GLASGOW_HASKELL__ >= 710 testPair damerauLevenshtein "a😀c" "abc" 1 #endif testPair damerauLevenshtein "lucky" "lucky" 0 testPair damerauLevenshtein "" "" 0 describe "damerauLevenshteinNorm" $ do testSwap damerauLevenshteinNorm testPair damerauLevenshteinNorm "veryvery long" "very long" (9 % 13) testPair damerauLevenshteinNorm "thing" "think" (4 % 5) testPair damerauLevenshteinNorm "nose" "ones" (1 % 2) testPair damerauLevenshteinNorm "thing" "sign" (2 % 5) testPair damerauLevenshteinNorm "red" "wax" (0 % 1) #if __GLASGOW_HASKELL__ >= 710 testPair damerauLevenshteinNorm "a😀c" "abc" (2 % 3) #endif testPair damerauLevenshteinNorm "lucky" "lucky" (1 % 1) testPair damerauLevenshteinNorm "" "" (1 % 1) describe "hamming" $ do testSwap hamming testPair hamming "karolin" "kathrin" (Just 3) testPair hamming "karolin" "kerstin" (Just 3) testPair hamming "1011101" "1001001" (Just 2) testPair hamming "2173896" "2233796" (Just 3) testPair hamming "toned" "roses" (Just 3) testPair hamming "red" "wax" (Just 3) #if __GLASGOW_HASKELL__ >= 710 testPair hamming "a😀c" "abc" (Just 1) #endif testPair hamming "lucky" "lucky" (Just 0) testPair hamming "" "" (Just 0) testPair hamming "small" "big" Nothing describe "jaro" $ do testPair jaro "aa" "a" (5 % 6) testPair jaro "a" "aa" (5 % 6) testPair jaro "martha" "marhta" (17 % 18) testPair jaro "marhta" "martha" (17 % 18) testPair jaro "dwayne" "duane" (37 % 45) testPair jaro "duane" "dwayne" (37 % 45) testPair jaro "dixon" "dicksonx" (23 % 30) testPair jaro "dicksonx" "dixon" (23 % 30) testPair jaro "jones" "johnson" (83 % 105) testPair jaro "johnson" "jones" (83 % 105) testPair jaro "brain" "brian" (14 % 15) testPair jaro "brian" "brain" (14 % 15) testPair jaro "five" "ten" (0 % 1) testPair jaro "ten" "five" (0 % 1) testPair jaro "lucky" "lucky" (1 % 1) #if __GLASGOW_HASKELL__ >= 710 testPair jaro "a😀c" "abc" (7 % 9) #endif testPair jaro "" "" (0 % 1) describe "jaroWinkler" $ do testPair jaroWinkler "aa" "a" (17 % 20) testPair jaroWinkler "a" "aa" (17 % 20) testPair jaroWinkler "martha" "marhta" (173 % 180) testPair jaroWinkler "marhta" "martha" (173 % 180) testPair jaroWinkler "dwayne" "duane" (21 % 25) testPair jaroWinkler "duane" "dwayne" (21 % 25) testPair jaroWinkler "dixon" "dicksonx" (61 % 75) testPair jaroWinkler "dicksonx" "dixon" (61 % 75) testPair jaroWinkler "jones" "johnson" (437 % 525) testPair jaroWinkler "johnson" "jones" (437 % 525) testPair jaroWinkler "brain" "brian" (71 % 75) testPair jaroWinkler "brian" "brain" (71 % 75) testPair jaroWinkler "five" "ten" (0 % 1) testPair jaroWinkler "ten" "five" (0 % 1) testPair jaroWinkler "lucky" "lucky" (1 % 1) #if __GLASGOW_HASKELL__ >= 710 testPair jaroWinkler "a😀c" "abc" (4 % 5) #endif testPair jaroWinkler "" "" (0 % 1) describe "overlap" $ do testSwap overlap testPair overlap "fly" "butterfly" (1 % 1) testPair overlap "night" "nacht" (3 % 5) testPair overlap "context" "contact" (5 % 7) testPair overlap "red" "wax" (0 % 1) #if __GLASGOW_HASKELL__ >= 710 testPair overlap "a😀c" "abc" (2 % 3) #endif testPair overlap "lucky" "lucky" (1 % 1) describe "jaccard" $ do testSwap jaccard testPair jaccard "xxx" "xyx" (1 % 2) testPair jaccard "night" "nacht" (3 % 7) testPair jaccard "context" "contact" (5 % 9) #if __GLASGOW_HASKELL__ >= 710 testPair overlap "a😀c" "abc" (2 % 3) #endif testPair jaccard "lucky" "lucky" (1 % 1) -- | Test that given function returns the same results when order of -- arguments is swapped. testSwap :: (Eq a, Show a) => (Text -> Text -> a) -> SpecWith () testSwap f = context "if we swap the arguments" $ it "produces the same result" $ property $ \a b -> f a b === f b a -- | Create spec for given metric function applying it to two 'Text' values -- and comparing the result with expected one. testPair :: (Eq a, Show a) => (Text -> Text -> a) -- ^ Function to test -> Text -- ^ First input -> Text -- ^ Second input -> a -- ^ Expected result -> SpecWith () testPair f a b r = it ("‘" ++ T.unpack a ++ "’ and ‘" ++ T.unpack b ++ "’") $ f a b `shouldBe` r text-metrics-0.3.0/bench-memory/Main.hs0000644000000000000000000000230113117722611016132 0ustar0000000000000000module Main (main) where import Control.DeepSeq import Control.Monad import Data.Text (Text) import Data.Text.Metrics import Weigh import qualified Data.Text as T main :: IO () main = mainWith $ do setColumns [Case, Allocated, GCs, Max] bmetric "levenshtein" levenshtein bmetric "levenshteinNorm" levenshteinNorm bmetric "damerauLevenshtein" damerauLevenshtein bmetric "damerauLevenshteinNorm" damerauLevenshteinNorm bmetric "overlap" overlap bmetric "jaccard" jaccard bmetric "hamming" hamming bmetric "jaro" jaro bmetric "jaroWinkler" jaroWinkler -- | Perform a series to measurements with the same metric function. bmetric :: NFData a => String -- ^ Name of the benchmark group -> (Text -> Text -> a) -- ^ The function to benchmark -> Weigh () bmetric name f = forM_ stdSeries $ \n -> func (name ++ "/" ++ show n) (uncurry f) (testData n, testData n) -- | The series of lengths to try with every function as part of 'btmetric'. stdSeries :: [Int] stdSeries = [5,10,20,40,80,160] testData :: Int -> Text testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a'..'z'] text-metrics-0.3.0/bench-speed/Main.hs0000644000000000000000000000213013117722611015722 0ustar0000000000000000module Main (main) where import Control.DeepSeq import Criterion.Main import Data.Text (Text) import Data.Text.Metrics import qualified Data.Text as T main :: IO () main = defaultMain [ btmetric "levenshtein" levenshtein , btmetric "levenshteinNorm" levenshteinNorm , btmetric "damerauLevenshtein" damerauLevenshtein , btmetric "damerauLevenshteinNorm" damerauLevenshteinNorm , btmetric "overlap" overlap , btmetric "jaccard" jaccard , btmetric "hamming" hamming , btmetric "jaro" jaro , btmetric "jaroWinkler" jaroWinkler ] -- | Produce benchmark group to test. btmetric :: NFData a => String -> (Text -> Text -> a) -> Benchmark btmetric name f = bgroup name (bs <$> stdSeries) where bs n = env (return (testData n, testData n)) (bench (show n) . nf (uncurry f)) -- | The series of lengths to try with every function as part of 'btmetric'. stdSeries :: [Int] stdSeries = [5,10,20,40,80,160] testData :: Int -> Text testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a'..'z'] text-metrics-0.3.0/CHANGELOG.md0000644000000000000000000000075613117722717014161 0ustar0000000000000000## Text Metrics 0.3.0 * All functions are now implemented in pure Haskell. * All functions return `Int` or `Ratio Int` instead of `Natural` and `Ratio Natural`. * Added `overlap` (returns overlap coefficient) and `jaccard` (returns Jaccard similarity coefficient). ## Text Metrics 0.2.0 * Made the `levenshtein`, `levenshteinNorm`, `damerauLevenshtein`, and `demerauLevenshtein` more efficient. * Added `jaro` and `jaroWinkler` functions. ## Text Metrics 0.1.0 * Initial release. text-metrics-0.3.0/README.md0000644000000000000000000000523213117722611013612 0ustar0000000000000000# Text Metrics [![License BSD3](https://img.shields.io/badge/license-BSD3-brightgreen.svg)](http://opensource.org/licenses/BSD-3-Clause) [![Hackage](https://img.shields.io/hackage/v/text-metrics.svg?style=flat)](https://hackage.haskell.org/package/text-metrics) [![Stackage Nightly](http://stackage.org/package/text-metrics/badge/nightly)](http://stackage.org/nightly/package/text-metrics) [![Stackage LTS](http://stackage.org/package/text-metrics/badge/lts)](http://stackage.org/lts/package/text-metrics) [![Build Status](https://travis-ci.org/mrkkrp/text-metrics.svg?branch=master)](https://travis-ci.org/mrkkrp/text-metrics) [![Coverage Status](https://coveralls.io/repos/mrkkrp/text-metrics/badge.svg?branch=master&service=github)](https://coveralls.io/github/mrkkrp/text-metrics?branch=master) The library provides efficient implementations of various strings metric algorithms. It works with strict `Text` values. The current version of the package implements: * [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) * [Normalized Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) * [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) * [Normalized Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) * [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) * [Jaro distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) * [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) * [Overlap coefficient](https://en.wikipedia.org/wiki/Overlap_coefficient) * [Jaccard similarity coefficient](https://en.wikipedia.org/wiki/Jaccard_index) ## Comparison with the `edit-distance` package There is [`edit-distance`](https://hackage.haskell.org/package/edit-distance) package whose scope overlaps with the scope of this package. The differences are: * `edit-distance` allows to specify costs for every operation when calculating Levenshtein distance (insertion, deletion, substitution, and transposition). This is rarely needed though in real-world applications, IMO. * `edit-distance` only provides Levenshtein distance, `text-metrics` aims to provide implementations of most string metrics algorithms. * `edit-distance` works on `Strings`, while `text-metrics` works on strict `Text` values. ## Implementation Although we originally used C for speed, currently all functions are pure Haskell tuned for performance. See [this blog post](https://markkarpov.com/post/migrating-text-metrics.html) for more info. ## License Copyright © 2016–2017 Mark Karpov Distributed under BSD 3 clause license. text-metrics-0.3.0/LICENSE.md0000644000000000000000000000265313114303023013730 0ustar0000000000000000Copyright © 2016–2017 Mark Karpov All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name Mark Karpov nor the names of contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. text-metrics-0.3.0/Setup.hs0000644000000000000000000000012713114303023013752 0ustar0000000000000000module Main (main) where import Distribution.Simple main :: IO () main = defaultMain text-metrics-0.3.0/text-metrics.cabal0000644000000000000000000000551013117734302015746 0ustar0000000000000000name: text-metrics version: 0.3.0 cabal-version: >= 1.10 tested-with: GHC==7.8.4, GHC==7.10.3, GHC==8.0.2, GHC==8.2.1 license: BSD3 license-file: LICENSE.md author: Mark Karpov maintainer: Mark Karpov homepage: https://github.com/mrkkrp/text-metrics bug-reports: https://github.com/mrkkrp/text-metrics/issues category: Text, Algorithms synopsis: Calculate various string metrics efficiently build-type: Simple description: Calculate various string metrics efficiently. extra-doc-files: CHANGELOG.md , README.md source-repository head type: git location: https://github.com/mrkkrp/text-metrics.git flag dev description: Turn on development settings. manual: True default: False library build-depends: base >= 4.7 && < 5.0 , containers >= 0.5.6.2 && < 0.6 , text >= 0.2 && < 1.3 , vector >= 0.11 && < 0.13 exposed-modules: Data.Text.Metrics if flag(dev) ghc-options: -Wall -Werror else ghc-options: -O2 -Wall default-language: Haskell2010 test-suite tests main-is: Main.hs hs-source-dirs: tests type: exitcode-stdio-1.0 build-depends: QuickCheck >= 2.8 && < 3.0 , base >= 4.7 && < 5.0 , hspec >= 2.0 && < 3.0 , text >= 0.2 && < 1.3 , text-metrics if flag(dev) ghc-options: -Wall -Werror else ghc-options: -O2 -Wall default-language: Haskell2010 benchmark bench-speed main-is: Main.hs hs-source-dirs: bench-speed type: exitcode-stdio-1.0 build-depends: base >= 4.7 && < 5.0 , criterion >= 0.6.2.1 && < 1.3 , deepseq >= 1.4 && < 1.5 , text >= 0.2 && < 1.3 , text-metrics if flag(dev) ghc-options: -O2 -Wall -Werror else ghc-options: -O2 -Wall default-language: Haskell2010 benchmark bench-memory main-is: Main.hs hs-source-dirs: bench-memory type: exitcode-stdio-1.0 build-depends: base >= 4.7 && < 5.0 , deepseq >= 1.4 && < 1.5 , text >= 0.2 && < 1.3 , text-metrics , weigh >= 0.0.4 if flag(dev) ghc-options: -O2 -Wall -Werror else ghc-options: -O2 -Wall default-language: Haskell2010