{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_HADDOCK hide, prune #-}
module Data.ByteString.Lazy.Search.Internal.BoyerMoore (
matchLL
, matchSL
, matchNOL
, replaceAllL
, breakSubstringL
, breakAfterL
, breakFindAfterL
, splitKeepEndL
, splitKeepFrontL
, splitDropL
) where
import Data.ByteString.Search.Internal.Utils
(occurs, suffShifts, ldrop, lsplit, keep, release, strictify)
import Data.ByteString.Search.Substitution
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import Data.ByteString.Unsafe (unsafeIndex)
import Data.Array.Base (unsafeAt)
import Data.Word (Word8)
import Data.Int (Int64)
{-# INLINE matchLL #-}
matchLL :: L.ByteString
-> L.ByteString
-> [Int64]
matchLL pat = search . L.toChunks
where
search = lazySearcher True (strictify pat)
{-# INLINE matchSL #-}
matchSL :: S.ByteString
-> L.ByteString
-> [Int64]
matchSL pat = search . L.toChunks
where
search = lazySearcher True pat
{-# INLINE matchNOL #-}
matchNOL :: S.ByteString
-> L.ByteString
-> [Int64]
matchNOL pat = search . L.toChunks
where
search = lazySearcher False pat
{-# INLINE replaceAllL #-}
replaceAllL :: Substitution rep
=> S.ByteString
-> rep
-> L.ByteString
-> L.ByteString
replaceAllL pat
| S.null pat = \sub -> prependCycle sub
| S.length pat == 1 =
let breaker = lazyBreak pat
repl subst strs
| null strs = []
| otherwise =
case breaker strs of
(pre, mtch) ->
pre ++ case mtch of
[] -> []
_ -> subst (repl subst (ldrop 1 mtch))
in \sub -> let repl1 = repl (substitution sub)
in L.fromChunks . repl1 . L.toChunks
| otherwise =
let repl = lazyRepl pat
in \sub -> let repl1 = repl (substitution sub)
in L.fromChunks . repl1 . L.toChunks
{-# INLINE breakSubstringL #-}
breakSubstringL :: S.ByteString
-> L.ByteString
-> (L.ByteString, L.ByteString)
breakSubstringL pat = breaker . L.toChunks
where
lbrk = lazyBreak pat
breaker strs = let (f, b) = lbrk strs
in (L.fromChunks f, L.fromChunks b)
breakAfterL :: S.ByteString
-> L.ByteString
-> (L.ByteString, L.ByteString)
breakAfterL pat
| S.null pat = \str -> (L.empty, str)
breakAfterL pat = breaker' . L.toChunks
where
!patLen = S.length pat
breaker = lazyBreak pat
breaker' strs =
let (pre, mtch) = breaker strs
(pl, a) = if null mtch then ([],[]) else lsplit patLen mtch
in (L.fromChunks (pre ++ pl), L.fromChunks a)
breakFindAfterL :: S.ByteString
-> L.ByteString
-> ((L.ByteString, L.ByteString), Bool)
breakFindAfterL pat
| S.null pat = \str -> ((L.empty, str), True)
breakFindAfterL pat = breaker' . L.toChunks
where
!patLen = S.length pat
breaker = lazyBreak pat
breaker' strs =
let (pre, mtch) = breaker strs
(pl, a) = if null mtch then ([],[]) else lsplit patLen mtch
in ((L.fromChunks (pre ++ pl), L.fromChunks a), not (null mtch))
{-# INLINE splitKeepEndL #-}
splitKeepEndL :: S.ByteString
-> L.ByteString
-> [L.ByteString]
splitKeepEndL pat
| S.null pat = const (repeat L.empty)
| otherwise =
let splitter = lazySplitKeepEnd pat
in map L.fromChunks . splitter . L.toChunks
{-# INLINE splitKeepFrontL #-}
splitKeepFrontL :: S.ByteString
-> L.ByteString
-> [L.ByteString]
splitKeepFrontL pat
| S.null pat = const (repeat L.empty)
| otherwise =
let splitter = lazySplitKeepFront pat
in map L.fromChunks . splitter . L.toChunks
{-# INLINE splitDropL #-}
splitDropL :: S.ByteString
-> L.ByteString
-> [L.ByteString]
splitDropL pat
| S.null pat = const (repeat L.empty)
| otherwise =
let splitter = lazySplitDrop pat
in map L.fromChunks . splitter . L.toChunks
lazySearcher :: Bool -> S.ByteString -> [S.ByteString] -> [Int64]
lazySearcher _ !pat
| S.null pat =
let zgo !prior [] = [prior]
zgo prior (!str : rest) =
let !l = S.length str
!prior' = prior + fromIntegral l
in [prior + fromIntegral i | i <- [0 .. l-1]] ++ zgo prior' rest
in zgo 0
| S.length pat == 1 =
let !w = S.head pat
ixes = S.elemIndices w
go _ [] = []
go !prior (!str : rest)
= let !prior' = prior + fromIntegral (S.length str)
in map ((+ prior) . fromIntegral) (ixes str) ++ go prior' rest
in go 0
lazySearcher !overlap pat = searcher
where
{-# INLINE patAt #-}
patAt :: Int -> Word8
patAt !i = unsafeIndex pat i
!patLen = S.length pat
!patEnd = patLen - 1
{-# INLINE preEnd #-}
preEnd = patEnd - 1
!maxLen = maxBound - patLen
!occT = occurs pat
!suffT = suffShifts pat
!skip = if overlap then unsafeAt suffT 0 else patLen
!kept = patLen - skip
!pe = patAt patEnd
{-# INLINE occ #-}
occ !w = unsafeAt occT (fromIntegral w)
{-# INLINE suff #-}
suff !i = unsafeAt suffT i
searcher lst = case lst of
[] -> []
(h : t) ->
if maxLen < S.length h
then error "Overflow in BoyerMoore.lazySearcher"
else seek 0 [] h t 0 patEnd
seek :: Int64 -> [S.ByteString] -> S.ByteString
-> [S.ByteString] -> Int -> Int -> [Int64]
seek !prior !past !str future !diffPos !patPos
| strPos < 0 =
case past of
(h : t) ->
let !hLen = S.length h
in seek (prior - fromIntegral hLen) t h (str : future)
(diffPos + hLen) patPos
[] -> error "seek back too far!"
| strEnd < strPos =
case future of
(h : t) ->
let {-# INLINE prior' #-}
prior' = prior + fromIntegral strLen
!diffPos' = diffPos - strLen
{-# INLINE past' #-}
past' = release (-diffPos') (str : past)
in if maxLen < S.length h
then error "Overflow in BoyerMoore.lazySearcher"
else seek prior' past' h t diffPos' patPos
[] -> []
| patPos == patEnd = checkEnd strPos
| diffPos < 0 = matcherN diffPos patPos
| otherwise = matcherP diffPos patPos
where
!strPos = diffPos + patPos
!strLen = S.length str
!strEnd = strLen - 1
!maxDiff = strLen - patLen
{-# INLINE strAt #-}
strAt !i = unsafeIndex str i
checkEnd !sI
| strEnd < sI = seek prior past str future (sI - patEnd) patEnd
| otherwise =
case strAt sI of
!c | c == pe ->
if sI < patEnd
then case sI of
0 -> seek prior past str future (-patEnd) preEnd
_ -> matcherN (sI - patEnd) preEnd
else matcherP (sI - patEnd) preEnd
| otherwise -> checkEnd (sI + patEnd + occ c)
matcherN !diff !patI =
case strAt (diff + patI) of
!c | c == patAt patI ->
if diff + patI == 0
then seek prior past str future diff (patI - 1)
else matcherN diff (patI - 1)
| otherwise ->
let {-# INLINE badShift #-}
badShift = patI + occ c
{-# INLINE goodShift #-}
goodShift = suff patI
!diff' = diff + max badShift goodShift
in if maxDiff < diff'
then seek prior past str future diff' patEnd
else checkEnd (diff' + patEnd)
matcherP !diff !patI =
case strAt (diff + patI) of
!c | c == patAt patI ->
if patI == 0
then prior + fromIntegral diff :
let !diff' = diff + skip
in if maxDiff < diff'
then seek prior past str future diff' patEnd
else
if skip == patLen
then
checkEnd (diff' + patEnd)
else
afterMatch diff' patEnd
else matcherP diff (patI - 1)
| otherwise ->
let {-# INLINE badShift #-}
badShift = patI + occ c
{-# INLINE goodShift #-}
goodShift = suff patI
!diff' = diff + max badShift goodShift
in if maxDiff < diff'
then seek prior past str future diff' patEnd
else checkEnd (diff' + patEnd)
afterMatch !diff !patI =
case strAt (diff + patI) of
!c | c == patAt patI ->
if patI == kept
then prior + fromIntegral diff :
let !diff' = diff + skip
in if maxDiff < diff'
then seek prior past str future diff' patEnd
else afterMatch diff' patEnd
else afterMatch diff (patI - 1)
| patI == patEnd ->
checkEnd (diff + (2*patEnd) + occ c)
| otherwise ->
let {-# INLINE badShift #-}
badShift = patI + occ c
{-# INLINE goodShift #-}
goodShift = suff patI
!diff' = diff + max badShift goodShift
in if maxDiff < diff'
then seek prior past str future diff' patEnd
else checkEnd (diff' + patEnd)
lazyBreak ::S.ByteString -> [S.ByteString] -> ([S.ByteString], [S.ByteString])
lazyBreak !pat
| S.null pat = \lst -> ([],lst)
| S.length pat == 1 =
let !w = S.head pat
go [] = ([], [])
go (!str : rest) =
case S.elemIndices w str of
[] -> let (pre, post) = go rest in (str : pre, post)
(i:_) -> if i == 0
then ([], str : rest)
else ([S.take i str], S.drop i str : rest)
in go
lazyBreak pat = breaker
where
!patLen = S.length pat
!patEnd = patLen - 1
!occT = occurs pat
!suffT = suffShifts pat
!maxLen = maxBound - patLen
!pe = patAt patEnd
{-# INLINE patAt #-}
patAt !i = unsafeIndex pat i
{-# INLINE occ #-}
occ !w = unsafeAt occT (fromIntegral w)
{-# INLINE suff #-}
suff !i = unsafeAt suffT i
breaker lst =
case lst of
[] -> ([],[])
(h:t) ->
if maxLen < S.length h
then error "Overflow in BoyerMoore.lazyBreak"
else seek [] h t 0 patEnd
seek :: [S.ByteString] -> S.ByteString -> [S.ByteString]
-> Int -> Int -> ([S.ByteString], [S.ByteString])
seek !past !str future !offset !patPos
| strPos < 0 =
case past of
[] -> error "not enough past!"
(h : t) -> seek t h (str : future) (offset + S.length h) patPos
| strEnd < strPos =
case future of
[] -> (foldr (flip (.) . (:)) id past [str], [])
(h : t) ->
let !off' = offset - strLen
(past', !discharge) = keep (-off') (str : past)
in if maxLen < S.length h
then error "Overflow in BoyerMoore.lazyBreak (future)"
else let (pre,post) = seek past' h t off' patPos
in (foldr (flip (.) . (:)) id discharge pre, post)
| patPos == patEnd = checkEnd strPos
| offset < 0 = matcherN offset patPos
| otherwise = matcherP offset patPos
where
{-# INLINE strAt #-}
strAt !i = unsafeIndex str i
!strLen = S.length str
!strEnd = strLen - 1
!maxOff = strLen - patLen
!strPos = offset + patPos
checkEnd !sI
| strEnd < sI = seek past str future (sI - patEnd) patEnd
| otherwise =
case strAt sI of
!c | c == pe ->
if sI < patEnd
then (if sI == 0
then seek past str future (-patEnd) (patEnd - 1)
else matcherN (sI - patEnd) (patEnd - 1))
else matcherP (sI - patEnd) (patEnd - 1)
| otherwise -> checkEnd (sI + patEnd + occ c)
matcherN !off !patI =
case strAt (off + patI) of
!c | c == patAt patI ->
if off + patI == 0
then seek past str future off (patI - 1)
else matcherN off (patI - 1)
| otherwise ->
let !off' = off + max (suff patI) (patI + occ c)
in if maxOff < off'
then seek past str future off' patEnd
else checkEnd (off' + patEnd)
matcherP !off !patI =
case strAt (off + patI) of
!c | c == patAt patI ->
if patI == 0
then let !pre = if off == 0 then [] else [S.take off str]
!post = S.drop off str
in (foldr (flip (.) . (:)) id past pre, post:future)
else matcherP off (patI - 1)
| otherwise ->
let !off' = off + max (suff patI) (patI + occ c)
in if maxOff < off'
then seek past str future off' patEnd
else checkEnd (off' + patEnd)
lazySplitKeepFront :: S.ByteString -> [S.ByteString] -> [[S.ByteString]]
lazySplitKeepFront pat = splitter'
where
!patLen = S.length pat
breaker = lazyBreak pat
splitter' strs = case splitter strs of
([]:rest) -> rest
other -> other
splitter [] = []
splitter strs =
case breaker strs of
(pre, mtch) ->
pre : case mtch of
[] -> []
_ -> case lsplit patLen mtch of
(pt, rst) ->
if null rst
then [pt]
else let (h : t) = splitter rst
in (pt ++ h) : t
lazySplitKeepEnd :: S.ByteString -> [S.ByteString] -> [[S.ByteString]]
lazySplitKeepEnd pat = splitter
where
!patLen = S.length pat
breaker = lazyBreak pat
splitter [] = []
splitter strs =
case breaker strs of
(pre, mtch) ->
let (h : t) = if null mtch
then [[]]
else case lsplit patLen mtch of
(pt, rst) -> pt : splitter rst
in (pre ++ h) : t
lazySplitDrop :: S.ByteString -> [S.ByteString] -> [[S.ByteString]]
lazySplitDrop pat = splitter
where
!patLen = S.length pat
breaker = lazyBreak pat
splitter [] = []
splitter strs = splitter' strs
splitter' [] = [[]]
splitter' strs = case breaker strs of
(pre,mtch) ->
pre : case mtch of
[] -> []
_ -> splitter' (ldrop patLen mtch)
lazyRepl :: S.ByteString -> ([S.ByteString] -> [S.ByteString])
-> [S.ByteString] -> [S.ByteString]
lazyRepl pat = replacer
where
!patLen = S.length pat
!patEnd = patLen - 1
!occT = occurs pat
!suffT = suffShifts pat
!maxLen = maxBound - patLen
!pe = patAt patEnd
{-# INLINE patAt #-}
patAt !i = unsafeIndex pat i
{-# INLINE occ #-}
occ !w = unsafeAt occT (fromIntegral w)
{-# INLINE suff #-}
suff !i = unsafeAt suffT i
replacer sub lst =
case lst of
[] -> []
(h:t) ->
if maxLen < S.length h
then error "Overflow in BoyerMoore.lazyRepl"
else seek [] h t 0 patEnd
where
chop _ [] = []
chop !k (!str : rest)
| k < s =
if maxLen < (s - k)
then error "Overflow in BoyerMoore.lazyRepl (chop)"
else seek [] (S.drop k str) rest 0 patEnd
| otherwise = chop (k-s) rest
where
!s = S.length str
seek :: [S.ByteString] -> S.ByteString -> [S.ByteString]
-> Int -> Int -> [S.ByteString]
seek !past !str fut !offset !patPos
| strPos < 0 =
case past of
[] -> error "not enough past!"
(h : t) -> seek t h (str : fut) (offset + S.length h) patPos
| strEnd < strPos =
case fut of
[] -> foldr (flip (.) . (:)) id past [str]
(h : t) ->
let !off' = offset - strLen
(past', !discharge) = keep (-off') (str : past)
in if maxLen < S.length h
then error "Overflow in BoyerMoore.lazyRepl (future)"
else foldr (flip (.) . (:)) id discharge $
seek past' h t off' patPos
| patPos == patEnd = checkEnd strPos
| offset < 0 = matcherN offset patPos
| otherwise = matcherP offset patPos
where
{-# INLINE strAt #-}
strAt !i = unsafeIndex str i
!strLen = S.length str
!strEnd = strLen - 1
!maxOff = strLen - patLen
!strPos = offset + patPos
checkEnd !sI
| strEnd < sI = seek past str fut (sI - patEnd) patEnd
| otherwise =
case strAt sI of
!c | c == pe ->
if sI < patEnd
then (if sI == 0
then seek past str fut (-patEnd) (patEnd - 1)
else matcherN (sI - patEnd) (patEnd - 1))
else matcherP (sI - patEnd) (patEnd - 1)
| otherwise -> checkEnd (sI + patEnd + occ c)
matcherN !off !patI =
case strAt (off + patI) of
!c | c == patAt patI ->
if off + patI == 0
then seek past str fut off (patI - 1)
else matcherN off (patI - 1)
| otherwise ->
let !off' = off + max (suff patI) (patI + occ c)
in if maxOff < off'
then seek past str fut off' patEnd
else checkEnd (off' + patEnd)
matcherP !off !patI =
case strAt (off + patI) of
!c | c == patAt patI ->
if patI == 0
then foldr (flip (.) . (:)) id past $
let pre = if off == 0
then id
else (S.take off str :)
in pre . sub $
let !p = off + patLen
in if p < strLen
then seek [] (S.drop p str) fut 0 patEnd
else chop (p - strLen) fut
else matcherP off (patI - 1)
| otherwise ->
let !off' = off + max (suff patI) (patI + occ c)
in if maxOff < off'
then seek past str fut off' patEnd
else checkEnd (off' + patEnd)