{-# LANGUAGE BangPatterns #-}
module Data.ByteString.Lazy.Search.DFA (
indices
, nonOverlappingIndices
, breakOn
, breakAfter
, breakFindAfter
, replace
, split
, splitKeepEnd
, splitKeepFront
) where
import Data.ByteString.Search.Internal.Utils (automaton, keep, ldrop, lsplit)
import Data.ByteString.Search.Substitution
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Internal as LI
import Data.ByteString.Unsafe (unsafeIndex)
import Data.Array.Base (unsafeAt)
import Data.Bits
import Data.Int (Int64)
{-# INLINE indices #-}
indices :: S.ByteString
-> L.ByteString
-> [Int64]
indices !pat = lazySearcher True pat . L.toChunks
{-# INLINE nonOverlappingIndices #-}
nonOverlappingIndices :: S.ByteString
-> L.ByteString
-> [Int64]
nonOverlappingIndices !pat = lazySearcher False pat . L.toChunks
breakOn :: S.ByteString
-> L.ByteString
-> (L.ByteString, L.ByteString)
breakOn pat = breaker . L.toChunks
where
lbrk = lazyBreaker True pat
breaker strs = let (f, b) = lbrk strs
in (L.fromChunks f, L.fromChunks b)
breakAfter :: S.ByteString
-> L.ByteString
-> (L.ByteString, L.ByteString)
breakAfter pat = breaker . L.toChunks
where
lbrk = lazyBreaker False pat
breaker strs = let (f, b) = lbrk strs
in (L.fromChunks f, L.fromChunks b)
breakFindAfter :: S.ByteString
-> L.ByteString
-> ((L.ByteString, L.ByteString), Bool)
breakFindAfter pat
| S.null pat = \str -> ((L.empty, str), True)
breakFindAfter pat = breaker . L.toChunks
where
!patLen = S.length pat
lbrk = lazyBreaker True pat
breaker strs = let (f, b) = lbrk strs
(f1, b1) = lsplit patLen b
mbpat = L.fromChunks f1
in ((foldr LI.chunk mbpat f, L.fromChunks b1), not (null b))
replace :: Substitution rep
=> S.ByteString
-> rep
-> L.ByteString
-> L.ByteString
replace pat
| S.null pat = \sub -> prependCycle sub
| otherwise =
let !patLen = S.length pat
breaker = lazyBreaker True pat
repl subst strs
| null strs = []
| otherwise =
let (pre, mtch) = breaker strs
in pre ++ case mtch of
[] -> []
_ -> subst (repl subst (ldrop patLen mtch))
in \sub -> let {-# NOINLINE subst #-}
!subst = substitution sub
repl1 = repl subst
in L.fromChunks . repl1 . L.toChunks
split :: S.ByteString
-> L.ByteString
-> [L.ByteString]
split pat
| S.null pat = const (repeat L.empty)
split pat = map L.fromChunks . splitter . L.toChunks
where
!patLen = S.length pat
breaker = lazyBreaker True pat
splitter strs
| null strs = []
| otherwise = splitter' strs
splitter' strs
| null strs = [[]]
| otherwise =
case breaker strs of
(pre, mtch) ->
pre : case mtch of
[] -> []
_ -> splitter' (ldrop patLen mtch)
splitKeepEnd :: S.ByteString
-> L.ByteString
-> [L.ByteString]
splitKeepEnd pat
| S.null pat = const (repeat L.empty)
splitKeepEnd pat = map L.fromChunks . splitter . L.toChunks
where
breaker = lazyBreaker False pat
splitter [] = []
splitter strs =
case breaker strs of
(pre, mtch) -> pre : splitter mtch
splitKeepFront :: S.ByteString
-> L.ByteString
-> [L.ByteString]
splitKeepFront pat
| S.null pat = const (repeat L.empty)
splitKeepFront pat = map L.fromChunks . splitter . L.toChunks
where
!patLen = S.length pat
breaker = lazyBreaker True pat
splitter strs = case splitter' strs of
([] : rst) -> rst
other -> other
splitter' [] = []
splitter' strs =
case breaker strs of
(pre, mtch) ->
pre : case mtch of
[] -> []
_ -> case lsplit patLen mtch of
(pt, rst) ->
if null rst
then [pt]
else let (h : t) = splitter' rst
in (pt ++ h) : t
lazySearcher :: Bool -> S.ByteString -> [S.ByteString] -> [Int64]
lazySearcher _ !pat
| S.null pat =
let zgo _ [] = []
zgo !prior (!str : rest) =
let !l = S.length str
!prior' = prior + fromIntegral l
in [prior + fromIntegral i | i <- [1 .. l]] ++ zgo prior' rest
in (0:) . zgo 0
| S.length pat == 1 =
let !w = S.head pat
ixes = S.elemIndices w
go _ [] = []
go !prior (!str : rest)
= let !prior' = prior + fromIntegral (S.length str)
in map ((+ prior) . fromIntegral) (ixes str) ++ go prior' rest
in go 0
lazySearcher !overlap pat = search 0 0
where
!patLen = S.length pat
!auto = automaton pat
!p0 = unsafeIndex pat 0
!ams = if overlap then patLen else 0
search _ _ [] = []
search !prior st (!str:rest) = match st 0
where
!strLen = S.length str
{-# INLINE strAt #-}
strAt :: Int -> Int
strAt i = fromIntegral (str `unsafeIndex` i)
match 0 !idx
| idx == strLen = search (prior + fromIntegral strLen) 0 rest
| unsafeIndex str idx == p0 = match 1 (idx + 1)
| otherwise = match 0 (idx + 1)
match state idx
| idx == strLen = search (prior + fromIntegral strLen) state rest
| otherwise =
let nstate = unsafeAt auto ((state `shiftL` 8) + strAt idx)
!nxtIdx = idx + 1
in if nstate == patLen
then (prior + fromIntegral (nxtIdx - patLen)) :
match ams nxtIdx
else match nstate nxtIdx
lazyBreaker :: Bool -> S.ByteString -> [S.ByteString]
-> ([S.ByteString], [S.ByteString])
lazyBreaker before pat
| S.null pat = \strs -> ([], strs)
| S.length pat == 1 =
let !w = S.head pat
!a = if before then 0 else 1
ixes = S.elemIndices w
scan [] = ([], [])
scan (!str:rest) =
let !strLen = S.length str
in case ixes str of
[] -> let (fr, bk) = scan rest in (str : fr, bk)
(i:_) -> let !j = i + a
in if j == strLen
then ([str],rest)
else ([S.take j str], S.drop j str : rest)
in scan
lazyBreaker !before pat = bscan [] 0
where
!patLen = S.length pat
!auto = automaton pat
!p0 = unsafeIndex pat 0
bscan _ _ [] = ([], [])
bscan !past !sta (!str:rest) = match sta 0
where
!strLen = S.length str
{-# INLINE strAt #-}
strAt :: Int -> Int
strAt i = fromIntegral (str `unsafeIndex` i)
match 0 idx
| idx == strLen =
let (fr, bk) = bscan [] 0 rest
in (foldr (flip (.) . (:)) id past (str:fr), bk)
| unsafeIndex str idx == p0 = match 1 (idx + 1)
| otherwise = match 0 (idx + 1)
match state idx
| idx == strLen =
let (kp, !rl) = if before
then keep state (str:past)
else ([], str:past)
(fr, bk) = bscan kp state rest
in (foldr (flip (.) . (:)) id rl fr, bk)
| otherwise =
let !nstate = unsafeAt auto ((state `shiftL` 8) + strAt idx)
!nxtIdx = idx + 1
in if nstate == patLen
then case if before then nxtIdx - patLen else nxtIdx of
0 -> (foldr (flip (.) . (:)) id past [], str:rest)
stIx | stIx < 0 -> rgo (-stIx) (str:rest) past
| stIx == strLen ->
(foldr (flip (.) . (:)) id past [str],rest)
| otherwise ->
(foldr (flip (.) . (:)) id past
[S.take stIx str], S.drop stIx str : rest)
else match nstate nxtIdx
{-# INLINE rgo #-}
rgo :: Int -> [S.ByteString] -> [S.ByteString]
-> ([S.ByteString], [S.ByteString])
rgo !kp acc (!str:more)
| sl == kp = (reverse more, str:acc)
| sl < kp = rgo (kp - sl) (str:acc) more
| otherwise = case S.splitAt (sl - kp) str of
(fr, bk) ->
(foldr (flip (.) . (:)) id more [fr], bk:acc)
where
!sl = S.length str
rgo _ _ [] = error "Not enough past!"