Safe Haskell	None
Language	Haskell2010

QuickSearch.String

Synopsis

buildQuickSearch :: (Hashable uid, Eq uid) => [(String, uid)] -> QuickSearch uid
rawBuildQuickSearch :: (Hashable uid, Eq uid) => [Entry Text uid] -> QuickSearch uid
topNMatches :: (Hashable uid, Eq uid) => QuickSearch uid -> Int -> Scorer -> String -> [Match Score (Entry String uid)]
matchesWithThreshold :: (Hashable uid, Eq uid) => QuickSearch uid -> Int -> Scorer -> String -> [Match Score (Entry String uid)]
batch :: (Hashable uid1, Eq uid1, Hashable uid2, Eq uid2) => (QuickSearch uid2 -> Int -> Scorer -> String -> [Match Score (Entry String uid2)]) -> QuickSearch uid2 -> Int -> Scorer -> [(String, uid1)] -> [(Entry String uid1, [Match Score (Entry String uid2)])]
batchTopNMatches :: (Hashable uid1, Eq uid1, Hashable uid2, Eq uid2) => QuickSearch uid2 -> Int -> Scorer -> [(String, uid1)] -> [(Entry String uid1, [Match Score (Entry String uid2)])]
batchMatchesWithThreshold :: (Hashable uid1, Eq uid1, Hashable uid2, Eq uid2) => QuickSearch uid2 -> Int -> Scorer -> [(String, uid1)] -> [(Entry String uid1, [Match Score (Entry String uid2)])]
type Token = Text
newtype Entry name uid = Entry (name, uid)
type Score = Int
type Scorer = Text -> Text -> Ratio Int
data Match score entry
newtype QuickSearch uid = QuickSearch ([Entry Text uid], HashMap Token (HashSet uid))
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
jaro :: Text -> Text -> Ratio Int
jaroWinkler :: Text -> Text -> Ratio Int

Documentation

buildQuickSearch Source #

Arguments

:: (Hashable uid, Eq uid)
=> [(String, uid)]	List of entries to be searched
-> QuickSearch uid	QuickSearch object holding token partitions

Given a list of pairs of (String, uid) to be searched, create a QuickSearch object.

rawBuildQuickSearch Source #

Arguments

:: (Hashable uid, Eq uid)
=> [Entry Text uid]	List of entries to be searched
-> QuickSearch uid	QuickSearch object holding token partitions

Given a list of entries to be searched, create a QuickSearch object.

topNMatches Source #

Arguments

:: (Hashable uid, Eq uid)
=> QuickSearch uid	QuickSearch object holding token partitions
-> Int	N: Number of results to return
-> Scorer	String similarity function of type (Text -> Text -> Ratio Int)
-> String	String to be searched
-> [Match Score (Entry String uid)]	Top N most similar entries

Given a QuickSearch object, scorer, and string, return the top N matches.

matchesWithThreshold Source #

Arguments

:: (Hashable uid, Eq uid)
=> QuickSearch uid	QuickSearch object holding token partitions
-> Int	Threshold score above which to return results
-> Scorer	String similarity function of type (Text -> Text -> Ratio Int)
-> String	String to be searched
-> [Match Score (Entry String uid)]	All entries with a score greater than the threshold

Given a QuickSearch object, scorer, and string, return all matches with a score greater than the given threshold.

batch Source #

Arguments

:: (Hashable uid1, Eq uid1, Hashable uid2, Eq uid2)
=> (QuickSearch uid2 -> Int -> Scorer -> String -> [Match Score (Entry String uid2)])	A match retrieval function, such as topNMatches
-> QuickSearch uid2	QuickSearch object holding token partitions
-> Int	The reference number for the match retrieval function. N for topNMatches, threshold for matchesWithThreshold
-> Scorer	String similarity function of type (Text -> Text -> Ratio Int)
-> [(String, uid1)]	List of entries to be processed
-> [(Entry String uid1, [Match Score (Entry String uid2)])]	List of entries and the results returned for each.

Turn a match retrieval function into one that works on lists of entries.

batchTopNMatches Source #

Arguments

:: (Hashable uid1, Eq uid1, Hashable uid2, Eq uid2)
=> QuickSearch uid2	QuickSearch object holding token partitions
-> Int	N: Number of results to return
-> Scorer	String similarity function of type (Text -> Text -> Ratio Int)
-> [(String, uid1)]	List of entries to be processed
-> [(Entry String uid1, [Match Score (Entry String uid2)])]	List of entries and up to the top N matches for each.

Version of topNMatches that processes lists of entries instead of strings.

batchMatchesWithThreshold Source #

Arguments

:: (Hashable uid1, Eq uid1, Hashable uid2, Eq uid2)
=> QuickSearch uid2	QuickSearch object holding token partitions
-> Int	Threshold score above which to return results
-> Scorer	String similarity function of type (Text -> Text -> Ratio Int)
-> [(String, uid1)]	List of entries to be processed
-> [(Entry String uid1, [Match Score (Entry String uid2)])]	List of entries and their matches above the score threshold.

Version of matchesWithThreshold that processes lists of entries instead of strings.

type Token = Text Source #

newtype Entry name uid Source #

Structure associating a name with its unique identifier

Constructors

Entry (name, uid)

Instances

Instances details

Bifunctor Entry Source #
Instance details Defined in QuickSearch.Internal.Filter Methods bimap :: (a -> b) -> (c -> d) -> Entry a c -> Entry b d # first :: (a -> b) -> Entry a c -> Entry b c # second :: (b -> c) -> Entry a b -> Entry a c #
(Eq name, Eq uid) => Eq (Entry name uid) Source #
Instance details Defined in QuickSearch.Internal.Filter Methods (==) :: Entry name uid -> Entry name uid -> Bool # (/=) :: Entry name uid -> Entry name uid -> Bool #
(Show name, Show uid) => Show (Entry name uid) Source #
Instance details Defined in QuickSearch.Internal.Filter Methods showsPrec :: Int -> Entry name uid -> ShowS # show :: Entry name uid -> String # showList :: [Entry name uid] -> ShowS #

type Score = Int Source #

type Scorer = Text -> Text -> Ratio Int Source #

data Match score entry Source #

Structure associating a Score with an Entry, for holding search results

Instances

Instances details

Bifunctor Match Source #
Instance details Defined in QuickSearch.Internal.Matcher Methods bimap :: (a -> b) -> (c -> d) -> Match a c -> Match b d # first :: (a -> b) -> Match a c -> Match b c # second :: (b -> c) -> Match a b -> Match a c #
(Eq score, Eq entry) => Eq (Match score entry) Source #
Instance details Defined in QuickSearch.Internal.Matcher Methods (==) :: Match score entry -> Match score entry -> Bool # (/=) :: Match score entry -> Match score entry -> Bool #
(Show score, Show entry) => Show (Match score entry) Source #
Instance details Defined in QuickSearch.Internal.Matcher Methods showsPrec :: Int -> Match score entry -> ShowS # show :: Match score entry -> String # showList :: [Match score entry] -> ShowS #

newtype QuickSearch uid Source #

List of entries to be searched and a HashMap associating tokens with HashSets of UIDs related to entries containing the tokens.

Constructors

QuickSearch ([Entry Text uid], HashMap Token (HashSet uid))

Instances

Instances details

Show uid => Show (QuickSearch uid) Source #
Instance details Defined in QuickSearch.Internal.Matcher Methods showsPrec :: Int -> QuickSearch uid -> ShowS # show :: QuickSearch uid -> String # showList :: [QuickSearch uid] -> ShowS #

damerauLevenshteinNorm :: Text -> Text -> Ratio Int #

Return normalized Damerau-Levenshtein distance between two Text values. 0 signifies no similarity between the strings, while 1 means exact match.

Heads up, before version 0.3.0 this function returned Ratio Natural.

jaro :: Text -> Text -> Ratio Int #

Return Jaro distance between two Text values. Returned value is in the range from 0 (no similarity) to 1 (exact match).

While the algorithm is pretty clear for artificial examples (like those from the linked Wikipedia article), for arbitrary strings, it may be hard to decide which of two strings should be considered as one having “reference” order of characters (order of matching characters is an essential part of the definition of the algorithm). This makes us consider the first string the “reference” string (with correct order of characters). Thus generally,

jaro a b ≠ jaro b a

This asymmetry can be found in all implementations of the algorithm on the internet, AFAIK.

Heads up, before version 0.3.0 this function returned Ratio Natural.

Since: text-metrics-0.2.0

jaroWinkler :: Text -> Text -> Ratio Int #

Return Jaro-Winkler distance between two Text values. Returned value is in the range from 0 (no similarity) to 1 (exact match).

Heads up, before version 0.3.0 this function returned Ratio Natural.

Since: text-metrics-0.2.0