{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_HADDOCK hide, prune #-}
-- |-- Module : Data.ByteString.Search.Internal.KnuthMorrisPratt-- Copyright : Justin Bailey-- Chris Kuklewicz-- Daniel Fischer-- Licence : BSD3-- Maintainer : Daniel Fischer <daniel.is.fischer@googlemail.com>-- Stability : Provisional-- Portability : non-portable (BangPatterns)---- Fast Knuth-Morris-Pratt search of both strict and-- lazy 'S.ByteString' values.---- A description of the algorithm can be found at-- <http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm>.-- Original authors: Justin Bailey (jgbailey at gmail.com) and-- Chris Kuklewicz (haskell at list.mightyreason.com).
module Data.ByteString.Search.Internal.KnuthMorrisPratt ( -- * Overview-- $overview-- * Partial application-- $partial-- * Complexity and Performance-- $complexity-- * Finding substrings-- ** Overlapping
indicesL
, indicesS
-- ** Non-overlapping
, matchLL
, matchLS
, matchSL
, matchSS
) where
import Data.ByteString.Search.Internal.Utils (kmpBorders, strictify)
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import Data.ByteString.Unsafe (unsafeIndex)
import Data.Array.Base (unsafeAt)
--import Data.Array.Unboxed
import Data.Int (Int64)
-- $overview---- This module exports 6 search functions: 'matchLL', 'matchLS',-- 'matchSL', and 'matchSS', which find the indices of all non-overlapping-- occurrences of a pattern in a target string, and the newly added-- 'indicesL' and 'indicesS' which find the indices of-- all (possibly overlapping) occurrences of the pattern in the target-- string. The performance should be the same when the pattern can't-- overlap, but when the pattern occurs often and can have significant-- overlap, the search excluding the overlap is faster.---- In all cases, the list of indices is produced lazily.---- The behaviour of the old @matchXY@ functions for an empty pattern has-- changed, formerly they returned an empty list, now all functions-- return @[0 .. 'length' target]@ for an empty pattern.---- The return type of the @matchXS@ functions changed to @['Int']@, since-- strict ByteStrings are @'Int'@-indexed.---- The trailing @L\/S@ in the function names indicate whether they work-- on lazy or strict ByteStrings. Since all patterns are converted to-- strict ByteStrings for performance reasons, the @matchLX@ add just-- an additional bit of wrapping around the worker in comparison to-- @matchSX@. For the new functions, no such wrapping is provided, you-- have to 'strictify' lazy patterns before feeding them to the searcher.-- The limit on the pattern lengths that the conversion to a strict-- ByteString imposes should be irrelevant in practice.---- The functions searching in lazy ByteStrings don't keep any references-- to chunks already traversed. This means the garbage collector can free-- those chunks early and only a small part of the target string needs to-- be in memory.-- $partial---- These functions can all be usefully partially applied. Given only a-- pattern, the auxiliary data will be computed only once, allowing for-- efficient re-use.-- $complexity---- The preprocessing of the pattern is /O/(@patternLength@) in time and space.-- The time complexity of the searching phase is /O/(@targetLength@) for all-- functions.---- In most cases, these functions are considerably slower than the-- Boyer-Moore variants, performance is close to that of those from-- "Data.ByteString.Search.DFA" resp. "Data.ByteString.Lazy.Search.DFA".-------------------------------------------------------------------------------- Wrappers ---------------------------------------------------------------------------------- | @'indicesL'@ finds all indices of (possibly overlapping)-- occurrences of the pattern in the target string.{-# INLINE indicesL #-}indicesL :: S.ByteString-- ^ Strict pattern
-> L.ByteString-- ^ Lazy target string
-> [Int64] -- ^ Offsets of matchesindicesLpat = search.L.toChunks
where
search = matcherTruepat-- | @'indicesS'@ finds all indices of (possibly overlapping)-- occurrences of the pattern in the target string.{-# INLINE indicesS #-}indicesS :: S.ByteString-- ^ Strict pattern
-> S.ByteString-- ^ Strict target string
-> [Int] -- ^ Offsets of matchesindicesSpat = search. (:[])
where
search = matcherTruepat-- | @'matchLL'@ finds the starting indices of all /non-overlapping/ occurrences-- of the pattern in the target string. It is a simple wrapper around-- 'Data.ByteString.Lazy.Search.KMP.nonOverlappingIndices' strictifying-- the pattern.{-# INLINE matchLL #-}matchLL :: L.ByteString-- ^ Lazy pattern
-> L.ByteString-- ^ Lazy target string
-> [Int64] -- ^ Offsets of matchesmatchLLpat = search.L.toChunks
where
!spat = strictifypatsearch = matcherFalsespat-- | @'matchLS'@ finds the starting indices of all /non-overlapping/ occurrences-- of the pattern in the target string. It is a simple wrapper around-- 'Data.ByteString.Search.KMP.nonOverlappingIndices' strictifying-- the pattern.{-# INLINE matchLS #-}matchLS :: L.ByteString-- ^ Lazy pattern
-> S.ByteString-- ^ Strict target string
-> [Int] -- ^ Offsets of matchesmatchLSpat = search. (:[])
where
!spat = strictifypatsearch = matcherFalsespat-- | @'matchSS'@ finds the starting indices of all /non-overlapping/ occurrences-- of the pattern in the target string. It is an alias for-- 'Data.ByteString.Search.KMP.nonOverlappingIndices'.{-# INLINE matchSS #-}matchSS :: S.ByteString-- ^ Strict pattern
-> S.ByteString-- ^ Strict target string
-> [Int] -- ^ Offsets of matchesmatchSSpat = search. (:[])
where
search = matcherFalsepat-- | @'matchSL'@ finds the starting indices of all /non-overlapping/ occurrences-- of the pattern in the target string. It is an alias for-- 'Data.ByteString.Lazy.Search.KMP.nonOverlappingIndices'.{-# INLINE matchSL #-}matchSL :: S.ByteString-- ^ Strict pattern
-> L.ByteString-- ^ Lazy target string
-> [Int64] -- ^ Offsets of matchesmatchSLpat = search.L.toChunks
where
search = matcherFalsepat-------------------------------------------------------------------------------- Worker --------------------------------------------------------------------------------{-# SPECIALISE matcher :: Bool -> S.ByteString -> [S.ByteString] -> [Int],
Bool -> S.ByteString -> [S.ByteString] -> [Int64] #-}matcher :: Integral a =>Bool -> S.ByteString -> [S.ByteString] -> [a]
matcher _ !pat
| S.nullpat = (0:) .go0
where
go _ [] = []
go !prior (!str:rest) = [prior+fromIntegrali | i <- [1 .. l]]
++goprior'rest
where
!l = S.lengthstr
!prior' = prior+fromIntegrallmatcher !overlappat = searcher00
where
!patLen = S.lengthpat
!bords = kmpBorderspat
!patH = patAt0{-# INLINE misi #-}misi !i = unsafeAtbordsi{-# INLINE patAt #-}patAt !i = unsafeIndexpati
!ami = if overlap then misipatLen else 0searcher _ _ [] = []
searcher !prior !patPos (!str:rest)
| patPos==0 = checkHead0
| otherwise = findMatchpatPos0
where
!strLen = S.lengthstr{-# INLINE strAt #-}strAt !i = unsafeIndexstricheckHead !strI
| strI==strLen =
searcher (prior+fromIntegralstrLen) 0rest
| strAtstrI==patH = findMatch1 (strI+1)
| otherwise = checkHead (strI+1)
findMatch !patI !strI
| patI==patLen =
(prior+fromIntegralstrI-fromIntegralpatLen)
: if ami==0 then checkHeadstrI else findMatchamistrI
| strI==strLen =
searcher (prior+fromIntegralstrLen) patIrest
| otherwise =
if strAtstrI==patAtpatI
then findMatch (patI+1) (strI+1)
else case misipatI of
0 -> checkHeadstrI
(-1) -> checkHead (strI+1)
pI -> findMatchpIstrI