{-# LANGUAGE BangPatterns #-}
-- |-- Module : Data.ByteString.Lazy.Search.KarpRabin-- Copyright : (c) 2010 Daniel Fischer-- Licence : BSD3-- Maintainer : Daniel Fischer <daniel.is.fischer@googlemail.com>-- Stability : Provisional-- Portability : non-portable (BangPatterns)---- Simultaneous search for multiple patterns in a lazy 'L.ByteString'-- using the Karp-Rabin algorithm.---- A description of the algorithm for a single pattern can be found at-- <http://www-igm.univ-mlv.fr/~lecroq/string/node5.html#SECTION0050>.
module Data.ByteString.Lazy.Search.KarpRabin ( -- * Overview-- $overview-- ** Caution-- $caution-- * Function
indicesOfAny
) where
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import Data.ByteString.Unsafe (unsafeIndex)
import qualified Data.IntMap as IM
import Data.Array
import Data.Array.Base (unsafeAt)
import Data.Word (Word8)
import Data.Int (Int64)
import Data.Bits
import Data.List (foldl')
-- $overview---- The Karp-Rabin algorithm works by calculating a hash of the pattern and-- comparing that hash with the hash of a slice of the target string with-- the same length as the pattern. If the hashes are equal, the slice of the-- target is compared to the pattern character by character (since the hash-- function generally isn't injective).---- For a single pattern, this tends to be more efficient than the naïve-- algorithm, but it cannot compete with algorithms like-- Knuth-Morris-Pratt or Boyer-Moore.---- However, the algorithm can be generalised to search for multiple patterns-- simultaneously. If the shortest pattern has length @k@, hash the prefix of-- length @k@ of all patterns and compare the hash of the target's slices of-- length @k@ to them. If there's a match, check whether the slice is part-- of an occurrence of the corresponding pattern.---- With a hash-function that---- * allows to compute the hash of one slice in constant time from the hash-- of the previous slice, the new and the dropped character, and---- * produces few spurious matches,---- searching for occurrences of any of @n@ patterns has a best-case complexity-- of /O/(@targetLength@ * @lookup n@). The worst-case complexity is-- /O/(@targetLength@ * @lookup n@ * @sum patternLengths@), the average is-- not much worse than the best case.---- The functions in this module store the hashes of the patterns in an-- 'IM.IntMap', so the lookup is /O/(@log n@). Re-hashing is done in constant-- time and spurious matches of the hashes /should be/ sufficiently rare.-- The maximal length of the prefixes to be hashed is 32.-- $caution---- Unfortunately, the constant factors are high, so these functions are slow.-- Unless the number of patterns to search for is high (larger than 50 at-- least), repeated search for single patterns using Boyer-Moore or DFA and-- manual merging of the indices is faster. /Much/ faster for less than 40-- or so patterns.---- 'indicesOfAny' has the advantage over multiple single-pattern searches that-- it doesn't hold on to large parts of the string (which is likely to happen-- for multiple searches), however, so in contrast to the strict version, it-- may be useful for relatively few patterns already.---- Nevertheless, this module seems more of an interesting curiosity than-- anything else.-- | @'indicesOfAny'@ finds all occurrences of any of several non-empty strict-- patterns in a lazy target string. If no non-empty patterns are given,-- the result is an empty list. Otherwise the result list contains-- the pairs of all indices where any of the (non-empty) patterns start-- and the list of all patterns starting at that index, the patterns being-- represented by their (zero-based) position in the pattern list.-- Empty patterns are filtered out before processing begins.{-# INLINE indicesOfAny #-}indicesOfAny :: [S.ByteString] -- ^ List of non-empty patterns
-> L.ByteString-- ^ String to search
-> [(Int64,[Int])] -- ^ List of matchesindicesOfAnypats
| nullnepats = const []
| otherwise = lazyMatchernepats.L.toChunks
where
nepats = filter (not.S.null) pats-------------------------------------------------------------------------------- Workers --------------------------------------------------------------------------------{-# INLINE rehash1 #-}rehash1 :: Int -> Int -> Word8 -> Word8 -> Intrehash1outhon =
(h`shiftL`1- (fromIntegralo`shiftL`out)) +fromIntegraln{-# INLINE rehash2 #-}rehash2 :: Int -> Int -> Word8 -> Word8 -> Intrehash2outhon =
(h`shiftL`2- (fromIntegralo`shiftL`out)) +fromIntegraln{-# INLINE rehash3 #-}rehash3 :: Int -> Int -> Word8 -> Word8 -> Intrehash3outhon =
(h`shiftL`3- (fromIntegralo`shiftL`out)) +fromIntegraln{-# INLINE rehash4 #-}rehash4 :: Int -> Int -> Word8 -> Word8 -> Intrehash4outhon =
(h`shiftL`4- (fromIntegralo`shiftL`out)) +fromIntegralnlazyMatcher :: [S.ByteString] -> [S.ByteString] -> [(Int64,[Int])]
lazyMatcherpats = search0hLenS.empty
where
!hLen = minimum (32:mapS.lengthpats)
!shDi = case 32`quot`hLen of
q | q<4 -> q
| otherwise -> 4
!outS = shDi*hLen
!patNum = lengthpats
!patArr = listArray (0, patNum-1) pats{-# INLINE rehash #-}rehash :: Int -> Word8 -> Word8 -> Intrehash = case shDi of
1 -> rehash1hLen2 -> rehash2outS3 -> rehash3outS
_ -> rehash4outShash :: S.ByteString -> Inthash = S.foldl' (\hw -> (h`shiftL`shDi) +fromIntegralw) 0.S.takehLen
!hashMap =
foldl' (\mp (h,i) -> IM.insertWith (flip(++)) h [i] mp) IM.empty$zip (maphashpats) [0 :: Int .. ]search _ _ _ [] = []
search !h !rm !prev (!str:rest)
| strLen<rm =
let !h' = S.foldl' (\ow -> (o`shiftL`1) +fromIntegralw) hstr
!prev' = S.appendprevstr
in searchh' (rm-strLen) prev'rest
| otherwise =
let !h' = S.foldl' (\ow -> (o`shiftL`1) +fromIntegralw) h
(S.takermstr)
in if S.nullprev
then noPast0reststrh'
else past0restprev0strrmh'
where
!strLen = S.lengthstrnoPast !priorrest !strhsh = gohsh0
where
!strLen = S.lengthstr
!maxIdx = strLen-hLen{-# INLINE strAt #-}strAt !i = unsafeIndexstrigo !hsI =
case IM.lookuphhashMap of
Nothing ->
if sI==maxIdx
then case rest of
[] -> []
(nxt:more) ->
let !h' = rehashh (strAtsI) (unsafeIndexnxt0)
!prior' = prior+fromIntegralstrLen
!prev = S.drop (sI+1) str
in if hLen==1
then noPastprior'morenxth'
else pastprior'moreprev0nxt1h'
else go (rehashh (strAtsI) (strAt (sI+hLen))) (sI+1)
Justps ->
let !rst = S.dropsIstr
!rLen = strLen-sI{-# INLINE hd #-}hd = strAtsI{-# INLINE more #-}more =
if sI==maxIdx
then case rest of
[] -> []
(nxt:fut) ->
let !h' = rehashhhd (unsafeIndexnxt0)
!prior' = prior+fromIntegralstrLen
in if hLen==1
then noPastprior'futnxth'
else pastprior'futrst1nxt1h'
else go (rehashhhd (strAt (sI+hLen))) (sI+1)
okaybs
| rLen<S.lengthbs = S.isPrefixOfrstbs&&checkFut (S.droprLenbs) rest
| otherwise = S.isPrefixOfbsrst
in case filter (okay. (patArr`unsafeAt`)) ps of
[] -> moreqs -> seq (lengthqs) $
(prior+fromIntegralsI,qs) :morepast !priorrest !prev !pI !str !sI !hsh
| strLen<4040 =
let !prior' = prior-1+fromIntegral (sI-hLen)
!curr = S.append (S.droppIprev) str
in noPastprior'restcurrhsh
| otherwise = gohshpIsI
where
!strLen = S.lengthstr{-# INLINE strAt #-}strAt !i = unsafeIndexstri{-# INLINE prevAt #-}prevAt !i = unsafeIndexprevigo !h !p !s
| s==hLen = noPastpriorreststrh
| otherwise =
case IM.lookuphhashMap of
Nothing ->
let {-# INLINE h' #-}h' = rehashh (prevAtp) (strAts)
in goh' (p+1) (s+1)
Justps ->
let !prst = S.droppprev{-# INLINE more #-}more = go (rehashh (prevAtp) (strAts)) (p+1) (s+1)
okaybs = checkFutbs (prst:str:rest)
in case filter (okay. (unsafeAtpatArr)) ps of
[] -> moreqs -> seq (lengthqs) $
(prior+fromIntegral (s-hLen), qs) :more{-# INLINE checkFut #-}checkFut :: S.ByteString -> [S.ByteString] -> BoolcheckFut _ [] = FalsecheckFut !bs (!h:t)
| hLen<S.lengthbs = S.isPrefixOfhbs&&checkFut (S.drophLenbs) t
| otherwise = S.isPrefixOfbsh
where
!hLen = S.lengthh