{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE GeneralisedNewtypeDeriving #-}

module QuickSearch.Internal.Filter
    ( buildTokenPartitions
    , getSearchPartition
    , wordTokenize
    , toTokenizedTuple
    , Token
    , Entry(..)
    , entryName
    , entryUID
    , first
    ) where

import           Control.Arrow                  ( Arrow((&&&)) )
import           Data.Bifunctor                 ( Bifunctor(bimap, first) )
import           Data.Char                      ( isAlphaNum
                                                , isSpace
                                                )
import qualified Data.HashMap.Lazy             as HMap
import qualified Data.HashSet                  as HSet
import           Data.Hashable                  ( Hashable )
import           Data.Maybe                     ( fromMaybe )
import qualified Data.Text                     as T

type Token = T.Text

-- | Structure associating a name with its unique identifier
newtype Entry name uid = Entry (name, uid)
  deriving newtype (Int -> Entry name uid -> ShowS
[Entry name uid] -> ShowS
Entry name uid -> String
(Int -> Entry name uid -> ShowS)
-> (Entry name uid -> String)
-> ([Entry name uid] -> ShowS)
-> Show (Entry name uid)
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
forall name uid.
(Show name, Show uid) =>
Int -> Entry name uid -> ShowS
forall name uid. (Show name, Show uid) => [Entry name uid] -> ShowS
forall name uid. (Show name, Show uid) => Entry name uid -> String
showList :: [Entry name uid] -> ShowS
$cshowList :: forall name uid. (Show name, Show uid) => [Entry name uid] -> ShowS
show :: Entry name uid -> String
$cshow :: forall name uid. (Show name, Show uid) => Entry name uid -> String
showsPrec :: Int -> Entry name uid -> ShowS
$cshowsPrec :: forall name uid.
(Show name, Show uid) =>
Int -> Entry name uid -> ShowS
Show, Entry name uid -> Entry name uid -> Bool
(Entry name uid -> Entry name uid -> Bool)
-> (Entry name uid -> Entry name uid -> Bool)
-> Eq (Entry name uid)
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
forall name uid.
(Eq name, Eq uid) =>
Entry name uid -> Entry name uid -> Bool
/= :: Entry name uid -> Entry name uid -> Bool
$c/= :: forall name uid.
(Eq name, Eq uid) =>
Entry name uid -> Entry name uid -> Bool
== :: Entry name uid -> Entry name uid -> Bool
$c== :: forall name uid.
(Eq name, Eq uid) =>
Entry name uid -> Entry name uid -> Bool
Eq)

instance Bifunctor Entry where
    bimap :: (a -> b) -> (c -> d) -> Entry a c -> Entry b d
bimap a -> b
f c -> d
g (Entry (a
name, c
uid)) = (b, d) -> Entry b d
forall name uid. (name, uid) -> Entry name uid
Entry (a -> b
f a
name, c -> d
g c
uid)

-- | Name accessor for an Entry
entryName :: Entry name uid -> name
entryName :: Entry name uid -> name
entryName (Entry (name
name, uid
_)) = name
name

-- | UID accessor for an Entry
entryUID :: Entry name uid -> uid
entryUID :: Entry name uid -> uid
entryUID (Entry (name
_, uid
uid)) = uid
uid

{- | Turn a Data.Text.Text string into a list of casefolded tokens.
     Turns most non-Alphanum into spaces and
     deletes all periods and apostrophes.

     >>> wordTokenize ("Jane Smith-Walker, M.D."::T.Text)
     ["jane", "smith", "walker", "md"]
-}
wordTokenize
    :: T.Text  -- ^ The target string
    -> [Token]  -- ^ A list of tokens from the target string, casefolded
wordTokenize :: Text -> [Text]
wordTokenize = Text -> [Text]
T.words (Text -> [Text]) -> (Text -> Text) -> Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
T.toCaseFold (Text -> Text) -> (Text -> Text) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char) -> Text -> Text
T.map Char -> Char
symToSpace (Text -> Text) -> (Text -> Text) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Text -> Text
T.filter (Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`notElem` String
".'")
  where
    symToSpace :: Char -> Char
    symToSpace :: Char -> Char
symToSpace Char
c | Char -> Bool
isAlphaNum Char
c Bool -> Bool -> Bool
|| Char -> Bool
isSpace Char
c = Char
c
                 | Bool
otherwise = Char
' '

{- | Convert an Entry T.Text uid to a tuple of ([wordTokenize name], uid)

     >>> toTokenizedTuple ("Jane Smith-Walker, M.D.", 1)::Entry
     (["jane", "smith", "walker", "md"], 1)
-}
toTokenizedTuple :: (Hashable uid, Eq uid) => Entry T.Text uid -> ([Token], uid)
toTokenizedTuple :: Entry Text uid -> ([Text], uid)
toTokenizedTuple = Text -> [Text]
wordTokenize (Text -> [Text])
-> (Entry Text uid -> Text) -> Entry Text uid -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Entry Text uid -> Text
forall name uid. Entry name uid -> name
entryName (Entry Text uid -> [Text])
-> (Entry Text uid -> uid) -> Entry Text uid -> ([Text], uid)
forall (a :: * -> * -> *) b c c'.
Arrow a =>
a b c -> a b c' -> a b (c, c')
&&& Entry Text uid -> uid
forall name uid. Entry name uid -> uid
entryUID

{- | Given the list of entries to be held by QuickSearch, return a HashMap
   keyed on tokens from the strings in the entries, where the associated
   HashMap value is the list of uids of entries containing the token.
-}
buildTokenPartitions
    :: (Hashable uid, Eq uid)
    => [Entry T.Text uid]  -- ^ List of entries
    -> HMap.HashMap Token (HSet.HashSet uid)  -- ^ A map of Token -> [uids]
buildTokenPartitions :: [Entry Text uid] -> HashMap Text (HashSet uid)
buildTokenPartitions = [([Text], uid)] -> HashMap Text (HashSet uid)
forall uid.
(Hashable uid, Eq uid) =>
[([Text], uid)] -> HashMap Text (HashSet uid)
tokenPartitions ([([Text], uid)] -> HashMap Text (HashSet uid))
-> ([Entry Text uid] -> [([Text], uid)])
-> [Entry Text uid]
-> HashMap Text (HashSet uid)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Entry Text uid -> ([Text], uid))
-> [Entry Text uid] -> [([Text], uid)]
forall a b. (a -> b) -> [a] -> [b]
map Entry Text uid -> ([Text], uid)
forall uid.
(Hashable uid, Eq uid) =>
Entry Text uid -> ([Text], uid)
toTokenizedTuple

{- | Given a list of tokenized entries to be held by QuickSearch,
   return a HashMap keyed on the set of distinct tokens where the associated
   HashMap value is the list of uids of entries containing the token.
-}
tokenPartitions
    :: forall uid
     . (Hashable uid, Eq uid)
    => [([Token], uid)]  -- ^ List of tokenized entries
    -> HMap.HashMap Token (HSet.HashSet uid)  -- ^ A map of Token -> [uids]
tokenPartitions :: [([Text], uid)] -> HashMap Text (HashSet uid)
tokenPartitions [([Text], uid)]
tokenizedEntries =
    [(Text, HashSet uid)] -> HashMap Text (HashSet uid)
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
HMap.fromList ([(Text, HashSet uid)] -> HashMap Text (HashSet uid))
-> [(Text, HashSet uid)] -> HashMap Text (HashSet uid)
forall a b. (a -> b) -> a -> b
$ [ (Text
tok, (Hashable uid, Eq uid) => Text -> HashSet uid
Text -> HashSet uid
allWith Text
tok) | Text
tok <- [Text]
allTokens ]
  where
    unstableNub :: [Token] -> [Token]
    -- ^ Quick dedupe of a list. Does not preserve order.
    unstableNub :: [Text] -> [Text]
unstableNub = HashSet Text -> [Text]
forall a. HashSet a -> [a]
HSet.toList (HashSet Text -> [Text])
-> ([Text] -> HashSet Text) -> [Text] -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Text] -> HashSet Text
forall a. (Eq a, Hashable a) => [a] -> HashSet a
HSet.fromList
    allTokens :: [Text]
allTokens   = [Text] -> [Text]
unstableNub ([Text] -> [Text])
-> ([([Text], uid)] -> [Text]) -> [([Text], uid)] -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (([Text], uid) -> [Text]) -> [([Text], uid)] -> [Text]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ([Text], uid) -> [Text]
forall a b. (a, b) -> a
fst ([([Text], uid)] -> [Text]) -> [([Text], uid)] -> [Text]
forall a b. (a -> b) -> a -> b
$ [([Text], uid)]
tokenizedEntries
    allWith :: (Hashable uid, Eq uid) => Token -> HSet.HashSet uid
    allWith :: Text -> HashSet uid
allWith Text
token =
        [uid] -> HashSet uid
forall a. (Eq a, Hashable a) => [a] -> HashSet a
HSet.fromList ([uid] -> HashSet uid)
-> ([([Text], uid)] -> [uid]) -> [([Text], uid)] -> HashSet uid
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (([Text], uid) -> uid) -> [([Text], uid)] -> [uid]
forall a b. (a -> b) -> [a] -> [b]
map ([Text], uid) -> uid
forall a b. (a, b) -> b
snd ([([Text], uid)] -> [uid])
-> ([([Text], uid)] -> [([Text], uid)]) -> [([Text], uid)] -> [uid]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (([Text], uid) -> Bool) -> [([Text], uid)] -> [([Text], uid)]
forall a. (a -> Bool) -> [a] -> [a]
filter ((Text
token Text -> [Text] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem`) ([Text] -> Bool)
-> (([Text], uid) -> [Text]) -> ([Text], uid) -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([Text], uid) -> [Text]
forall a b. (a, b) -> a
fst) ([([Text], uid)] -> HashSet uid) -> [([Text], uid)] -> HashSet uid
forall a b. (a -> b) -> a -> b
$ [([Text], uid)]
tokenizedEntries

{- | Given a target string and a Token HashMap, return the union of
   sets of uids associated with the tokens in the target string
-}
getSearchPartition
    :: (Hashable uid, Eq uid)
    => T.Text  -- ^ Target string
    -> HMap.HashMap Token (HSet.HashSet uid)
  -- ^ HashMap associating tokens with sets of uids
    -> HSet.HashSet uid  -- ^ The union of sets of associated uids.
getSearchPartition :: Text -> HashMap Text (HashSet uid) -> HashSet uid
getSearchPartition Text
name HashMap Text (HashSet uid)
tokenMap =
    let tokens :: [Text]
tokens = Text -> [Text]
wordTokenize Text
name
    in  [HashSet uid] -> HashSet uid
forall a. (Eq a, Hashable a) => [HashSet a] -> HashSet a
HSet.unions ([HashSet uid] -> HashSet uid) -> [HashSet uid] -> HashSet uid
forall a b. (a -> b) -> a -> b
$ (Text -> HashSet uid) -> [Text] -> [HashSet uid]
forall a b. (a -> b) -> [a] -> [b]
map (HashSet uid -> Maybe (HashSet uid) -> HashSet uid
forall a. a -> Maybe a -> a
fromMaybe HashSet uid
forall a. HashSet a
HSet.empty (Maybe (HashSet uid) -> HashSet uid)
-> (Text -> Maybe (HashSet uid)) -> Text -> HashSet uid
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Text -> HashMap Text (HashSet uid) -> Maybe (HashSet uid)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
`HMap.lookup` HashMap Text (HashSet uid)
tokenMap)) [Text]
tokens