{-# LANGUAGE ViewPatterns #-}

module QuickSearch.String
    ( buildQuickSearch
    , rawBuildQuickSearch
    , topNMatches
    , matchesWithThreshold
    , batch
    , batchTopNMatches
    , batchMatchesWithThreshold
    , Token
    , Entry(..)
    , Score
    , Scorer
    , Match
    , QuickSearch(..)
    , damerauLevenshteinNorm
    , jaro
    , jaroWinkler
    ) where

import           Data.Bifunctor                 ( Bifunctor(..) )
import           Data.Hashable                  ( Hashable )
import qualified Data.Text                     as T
import           Data.Text.Metrics              ( damerauLevenshteinNorm
                                                , jaro
                                                , jaroWinkler
                                                )

import           QuickSearch                    ( Entry(..)
                                                , Token
                                                , entryName
                                                , rawBuildQuickSearch
                                                )
import           QuickSearch.Internal.Matcher   ( Match(..)
                                                , QuickSearch(..)
                                                , Score
                                                , Scorer
                                                , matchScore
                                                , scoreMatches
                                                )

-- | Given a Match (Entry String uid), return it as a Match (Entry uid)
scoredTextToString
    :: (Hashable uid, Eq uid)
    => Match Score (Entry T.Text uid)
    -> Match Score (Entry String uid)
scoredTextToString :: Match Score (Entry Text uid) -> Match Score (Entry String uid)
scoredTextToString = (Entry Text uid -> Entry String uid)
-> Match Score (Entry Text uid) -> Match Score (Entry String uid)
forall (p :: * -> * -> *) b c a.
Bifunctor p =>
(b -> c) -> p a b -> p a c
second ((Entry Text uid -> Entry String uid)
 -> Match Score (Entry Text uid) -> Match Score (Entry String uid))
-> (Entry Text uid -> Entry String uid)
-> Match Score (Entry Text uid)
-> Match Score (Entry String uid)
forall a b. (a -> b) -> a -> b
$ (Text -> String) -> Entry Text uid -> Entry String uid
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
first Text -> String
T.unpack

{- | Given a list of pairs of (String, uid) to be searched,
   create a QuickSearch object.
-}
buildQuickSearch
    :: (Hashable uid, Eq uid)
    => [(String, uid)]  -- ^ List of entries to be searched
    -> QuickSearch uid  -- ^ QuickSearch object holding token partitions
buildQuickSearch :: [(String, uid)] -> QuickSearch uid
buildQuickSearch = [Entry Text uid] -> QuickSearch uid
forall uid.
(Hashable uid, Eq uid) =>
[Entry Text uid] -> QuickSearch uid
rawBuildQuickSearch ([Entry Text uid] -> QuickSearch uid)
-> ([(String, uid)] -> [Entry Text uid])
-> [(String, uid)]
-> QuickSearch uid
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((String, uid) -> Entry Text uid)
-> [(String, uid)] -> [Entry Text uid]
forall a b. (a -> b) -> [a] -> [b]
map ((Text, uid) -> Entry Text uid
forall name uid. (name, uid) -> Entry name uid
Entry ((Text, uid) -> Entry Text uid)
-> ((String, uid) -> (Text, uid))
-> (String, uid)
-> Entry Text uid
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (String -> Text) -> (String, uid) -> (Text, uid)
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
first String -> Text
T.pack)

-- | Given a QuickSearch object, scorer, and string, return the top N matches.
topNMatches
    :: (Hashable uid, Eq uid)
    => QuickSearch uid  -- ^ QuickSearch object holding token partitions
    -> Int  -- ^ N: Number of results to return
    -> Scorer  -- ^ String similarity function of type (Text -> Text -> Ratio Int)
    -> String  -- ^ String to be searched
    -> [Match Score (Entry String uid)]  -- ^ Top N most similar entries
topNMatches :: QuickSearch uid
-> Score -> Scorer -> String -> [Match Score (Entry String uid)]
topNMatches QuickSearch uid
qs Score
n Scorer
scorer (String -> Text
T.pack -> Text
entry) = (Match Score (Entry Text uid) -> Match Score (Entry String uid))
-> [Match Score (Entry Text uid)]
-> [Match Score (Entry String uid)]
forall a b. (a -> b) -> [a] -> [b]
map Match Score (Entry Text uid) -> Match Score (Entry String uid)
forall uid.
(Hashable uid, Eq uid) =>
Match Score (Entry Text uid) -> Match Score (Entry String uid)
scoredTextToString [Match Score (Entry Text uid)]
results
    where results :: [Match Score (Entry Text uid)]
results = Score
-> [Match Score (Entry Text uid)] -> [Match Score (Entry Text uid)]
forall a. Score -> [a] -> [a]
take Score
n (Text -> QuickSearch uid -> Scorer -> [Match Score (Entry Text uid)]
forall uid.
(Hashable uid, Eq uid) =>
Text -> QuickSearch uid -> Scorer -> [Match Score (Entry Text uid)]
scoreMatches Text
entry QuickSearch uid
qs Scorer
scorer)

{- | Given a QuickSearch object, scorer, and string, return all matches with a
   score greater than the given threshold.
-}
matchesWithThreshold
    :: (Hashable uid, Eq uid)
    => QuickSearch uid  -- ^ QuickSearch object holding token partitions
    -> Int  -- ^ Threshold score above which to return results
    -> Scorer  -- ^ String similarity function of type (Text -> Text -> Ratio Int)
    -> String  -- ^ String to be searched
    -> [Match Score (Entry String uid)]  -- ^ Top N most similar entries
matchesWithThreshold :: QuickSearch uid
-> Score -> Scorer -> String -> [Match Score (Entry String uid)]
matchesWithThreshold QuickSearch uid
qs Score
cutoff Scorer
scorer (String -> Text
T.pack -> Text
entry) =
    let results :: [Match Score (Entry Text uid)]
results = Text -> QuickSearch uid -> Scorer -> [Match Score (Entry Text uid)]
forall uid.
(Hashable uid, Eq uid) =>
Text -> QuickSearch uid -> Scorer -> [Match Score (Entry Text uid)]
scoreMatches Text
entry QuickSearch uid
qs Scorer
scorer
    in  (Match Score (Entry Text uid) -> Match Score (Entry String uid))
-> [Match Score (Entry Text uid)]
-> [Match Score (Entry String uid)]
forall a b. (a -> b) -> [a] -> [b]
map Match Score (Entry Text uid) -> Match Score (Entry String uid)
forall uid.
(Hashable uid, Eq uid) =>
Match Score (Entry Text uid) -> Match Score (Entry String uid)
scoredTextToString ([Match Score (Entry Text uid)]
 -> [Match Score (Entry String uid)])
-> [Match Score (Entry Text uid)]
-> [Match Score (Entry String uid)]
forall a b. (a -> b) -> a -> b
$ (Match Score (Entry Text uid) -> Bool)
-> [Match Score (Entry Text uid)] -> [Match Score (Entry Text uid)]
forall a. (a -> Bool) -> [a] -> [a]
takeWhile ((Score -> Score -> Bool
forall a. Ord a => a -> a -> Bool
>= Score
cutoff) (Score -> Bool)
-> (Match Score (Entry Text uid) -> Score)
-> Match Score (Entry Text uid)
-> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Match Score (Entry Text uid) -> Score
forall name uid. Match Score (Entry name uid) -> Score
matchScore) [Match Score (Entry Text uid)]
results

-- | Turn a match retrieval function into one that works on lists of entries.
batch
    :: (Hashable uid1, Eq uid1, Hashable uid2, Eq uid2)
    => (  QuickSearch uid2
       -> Int
       -> Scorer
       -> String
       -> [Match Score (Entry String uid2)]
       )
  -- ^ A match retrieval function, such as topNMatches
    -> QuickSearch uid2  -- ^ QuickSearch object holding token partitions
    -> Int  {- ^ The reference number for the match retrieval function.
             N for topNMatches, threshold for matchesWithThreshold
          -}
    -> Scorer  -- ^ String similarity function of type (Text -> Text -> Ratio Int)
    -> [(String, uid1)]  -- ^ List of entries to be processed
    -> [(Entry String uid1, [Match Score (Entry String uid2)])]
  -- ^ List of entries and the results returned for each.
batch :: (QuickSearch uid2
 -> Score -> Scorer -> String -> [Match Score (Entry String uid2)])
-> QuickSearch uid2
-> Score
-> Scorer
-> [(String, uid1)]
-> [(Entry String uid1, [Match Score (Entry String uid2)])]
batch QuickSearch uid2
-> Score -> Scorer -> String -> [Match Score (Entry String uid2)]
f QuickSearch uid2
qs Score
n Scorer
scorer [(String, uid1)]
entries =
    let entries' :: [Entry String uid1]
entries' = ((String, uid1) -> Entry String uid1)
-> [(String, uid1)] -> [Entry String uid1]
forall a b. (a -> b) -> [a] -> [b]
map (String, uid1) -> Entry String uid1
forall name uid. (name, uid) -> Entry name uid
Entry [(String, uid1)]
entries
        results :: [[Match Score (Entry String uid2)]]
results  = (Entry String uid1 -> [Match Score (Entry String uid2)])
-> [Entry String uid1] -> [[Match Score (Entry String uid2)]]
forall a b. (a -> b) -> [a] -> [b]
map (QuickSearch uid2
-> Score -> Scorer -> String -> [Match Score (Entry String uid2)]
f QuickSearch uid2
qs Score
n Scorer
scorer (String -> [Match Score (Entry String uid2)])
-> (Entry String uid1 -> String)
-> Entry String uid1
-> [Match Score (Entry String uid2)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Entry String uid1 -> String
forall name uid. Entry name uid -> name
entryName) [Entry String uid1]
entries'
    in  [Entry String uid1]
-> [[Match Score (Entry String uid2)]]
-> [(Entry String uid1, [Match Score (Entry String uid2)])]
forall a b. [a] -> [b] -> [(a, b)]
zip [Entry String uid1]
entries' [[Match Score (Entry String uid2)]]
results


-- | Version of topNMatches that processes lists of entries instead of strings.
batchTopNMatches
    :: (Hashable uid1, Eq uid1, Hashable uid2, Eq uid2)
    => QuickSearch uid2  -- ^ QuickSearch object holding token partitions
    -> Int  -- ^ N: Number of results to return
    -> Scorer  -- ^ String similarity function of type (Text -> Text -> Ratio Int)
    -> [(String, uid1)]  -- ^ List of entries to be processed
    -> [(Entry String uid1, [Match Score (Entry String uid2)])]
  -- ^ List of entries and up to the top N matches for each.
batchTopNMatches :: QuickSearch uid2
-> Score
-> Scorer
-> [(String, uid1)]
-> [(Entry String uid1, [Match Score (Entry String uid2)])]
batchTopNMatches = (QuickSearch uid2
 -> Score -> Scorer -> String -> [Match Score (Entry String uid2)])
-> QuickSearch uid2
-> Score
-> Scorer
-> [(String, uid1)]
-> [(Entry String uid1, [Match Score (Entry String uid2)])]
forall uid1 uid2.
(Hashable uid1, Eq uid1, Hashable uid2, Eq uid2) =>
(QuickSearch uid2
 -> Score -> Scorer -> String -> [Match Score (Entry String uid2)])
-> QuickSearch uid2
-> Score
-> Scorer
-> [(String, uid1)]
-> [(Entry String uid1, [Match Score (Entry String uid2)])]
batch QuickSearch uid2
-> Score -> Scorer -> String -> [Match Score (Entry String uid2)]
forall uid.
(Hashable uid, Eq uid) =>
QuickSearch uid
-> Score -> Scorer -> String -> [Match Score (Entry String uid)]
topNMatches

{- | Version of matchesWithThreshold that processes lists of entries
   instead of strings.
-}
batchMatchesWithThreshold
    :: (Hashable uid1, Eq uid1, Hashable uid2, Eq uid2)
    => QuickSearch uid2  -- ^ QuickSearch object holding token partitions
    -> Int  -- ^ N: Number of results to return
    -> Scorer  -- ^ String similarity function of type (Text -> Text -> Ratio Int)
    -> [(String, uid1)]  -- ^ List of entries to be processed
    -> [(Entry String uid1, [Match Score (Entry String uid2)])]
  -- ^ List of entries and their matches above the score threshold.
batchMatchesWithThreshold :: QuickSearch uid2
-> Score
-> Scorer
-> [(String, uid1)]
-> [(Entry String uid1, [Match Score (Entry String uid2)])]
batchMatchesWithThreshold = (QuickSearch uid2
 -> Score -> Scorer -> String -> [Match Score (Entry String uid2)])
-> QuickSearch uid2
-> Score
-> Scorer
-> [(String, uid1)]
-> [(Entry String uid1, [Match Score (Entry String uid2)])]
forall uid1 uid2.
(Hashable uid1, Eq uid1, Hashable uid2, Eq uid2) =>
(QuickSearch uid2
 -> Score -> Scorer -> String -> [Match Score (Entry String uid2)])
-> QuickSearch uid2
-> Score
-> Scorer
-> [(String, uid1)]
-> [(Entry String uid1, [Match Score (Entry String uid2)])]
batch QuickSearch uid2
-> Score -> Scorer -> String -> [Match Score (Entry String uid2)]
forall uid.
(Hashable uid, Eq uid) =>
QuickSearch uid
-> Score -> Scorer -> String -> [Match Score (Entry String uid)]
matchesWithThreshold