{-# LANGUAGE BangPatterns             #-}
{-# LANGUAGE CPP                      #-}
{-# LANGUAGE ForeignFunctionInterface #-}
{-# LANGUAGE MagicHash                #-}

module Data.HashTable.Internal.CacheLine
  ( cacheLineSearch
  , cacheLineSearch2
  , cacheLineSearch3
  , forwardSearch2
  , forwardSearch3
  , isCacheLineAligned
  , advanceByCacheLineSize
  , prefetchRead
  , prefetchWrite
  , bl_abs#
  , sign#
  , mask#
  , maskw#
  ) where

#if MIN_VERSION_base(4,4,0)
import           Control.Monad.ST (ST)
import           Control.Monad.ST.Unsafe
#else
import           Control.Monad.ST
#endif

import           Data.HashTable.Internal.IntArray (IntArray)
import qualified Data.HashTable.Internal.IntArray as M

#ifndef NO_C_SEARCH
import           Control.Monad
import           Foreign.C.Types
#else
import           Data.Bits
import           Data.Int
import qualified Data.Vector.Unboxed         as U
import           GHC.Int
#endif

import           Data.HashTable.Internal.Utils
import           GHC.Exts

#if __GLASGOW_HASKELL__ >= 707
import GHC.Exts (isTrue#)
#else
isTrue# :: Bool -> Bool
isTrue# = id
#endif


{-# INLINE prefetchRead  #-}
{-# INLINE prefetchWrite #-}
prefetchRead :: IntArray s -> Int -> ST s ()
prefetchWrite :: IntArray s -> Int -> ST s ()

#ifndef NO_C_SEARCH
foreign import ccall unsafe "lineSearch32"
  c_lineSearch32 :: Ptr a -> CInt -> CUInt -> IO CInt

foreign import ccall unsafe "lineSearch64"
  c_lineSearch64 :: Ptr a -> CInt -> CULong -> IO CInt

foreign import ccall unsafe "lineSearch32_2"
  c_lineSearch32_2 :: Ptr a -> CInt -> CUInt -> CUInt -> IO CInt

foreign import ccall unsafe "lineSearch64_2"
  c_lineSearch64_2 :: Ptr a -> CInt -> CULong -> CULong -> IO CInt

foreign import ccall unsafe "lineSearch32_3"
  c_lineSearch32_3 :: Ptr a -> CInt -> CUInt -> CUInt -> CUInt -> IO CInt

foreign import ccall unsafe "lineSearch64_3"
  c_lineSearch64_3 :: Ptr a -> CInt -> CULong -> CULong -> CULong -> IO CInt

foreign import ccall unsafe "forwardSearch32_2"
  c_forwardSearch32_2 :: Ptr a -> CInt -> CInt -> CUInt -> CUInt -> IO CInt

foreign import ccall unsafe "forwardSearch32_3"
  c_forwardSearch32_3 :: Ptr a -> CInt -> CInt -> CUInt -> CUInt -> CUInt
                      -> IO CInt

foreign import ccall unsafe "forwardSearch64_2"
  c_forwardSearch64_2 :: Ptr a -> CInt -> CInt -> CULong -> CULong -> IO CInt

foreign import ccall unsafe "forwardSearch64_3"
  c_forwardSearch64_3 :: Ptr a -> CInt -> CInt -> CULong -> CULong -> CULong
                      -> IO CInt

foreign import ccall unsafe "prefetchCacheLine32_read"
  prefetchCacheLine32_read :: Ptr a -> CInt -> IO ()

foreign import ccall unsafe "prefetchCacheLine64_read"
  prefetchCacheLine64_read :: Ptr a -> CInt -> IO ()

foreign import ccall unsafe "prefetchCacheLine32_write"
  prefetchCacheLine32_write :: Ptr a -> CInt -> IO ()

foreign import ccall unsafe "prefetchCacheLine64_write"
  prefetchCacheLine64_write :: Ptr a -> CInt -> IO ()


fI :: (Num b, Integral a) => a -> b
fI = fromIntegral


prefetchRead a i = unsafeIOToST c
  where
    v   = M.toPtr a
    x   = fI i
    c32 = prefetchCacheLine32_read v x
    c64 = prefetchCacheLine64_read v x
    c   = if wordSize == 32 then c32 else c64


prefetchWrite a i = unsafeIOToST c
  where
    v   = M.toPtr a
    x   = fI i
    c32 = prefetchCacheLine32_write v x
    c64 = prefetchCacheLine64_write v x
    c   = if wordSize == 32 then c32 else c64


{-# INLINE forwardSearch2 #-}
forwardSearch2 :: IntArray s -> Int -> Int -> Int -> Int -> ST s Int
forwardSearch2 !vec !start !end !x1 !x2 =
    liftM fromEnum $! unsafeIOToST c
  where
    c32 = c_forwardSearch32_2 (M.toPtr vec) (fI start) (fI end) (fI x1) (fI x2)
    c64 = c_forwardSearch64_2 (M.toPtr vec) (fI start) (fI end) (fI x1) (fI x2)
    c = if wordSize == 32 then c32 else c64


{-# INLINE forwardSearch3 #-}
forwardSearch3 :: IntArray s -> Int -> Int -> Int -> Int -> Int -> ST s Int
forwardSearch3 !vec !start !end !x1 !x2 !x3 =
    liftM fromEnum $! unsafeIOToST c
  where
    c32 = c_forwardSearch32_3 (M.toPtr vec) (fI start) (fI end)
                              (fI x1) (fI x2) (fI x3)
    c64 = c_forwardSearch64_3 (M.toPtr vec) (fI start) (fI end)
                              (fI x1) (fI x2) (fI x3)
    c = if wordSize == 32 then c32 else c64


{-# INLINE lineSearch #-}
lineSearch :: IntArray s -> Int -> Int -> ST s Int
lineSearch !vec !start !value =
    liftM fromEnum $! unsafeIOToST c
  where
    c32 = c_lineSearch32 (M.toPtr vec) (fI start) (fI value)
    c64 = c_lineSearch64 (M.toPtr vec) (fI start) (fI value)
    c = if wordSize == 32 then c32 else c64

{-# INLINE lineSearch2 #-}
lineSearch2 :: IntArray s -> Int -> Int -> Int -> ST s Int
lineSearch2 !vec !start !x1 !x2 =
    liftM fromEnum $! unsafeIOToST c
  where
    c32 = c_lineSearch32_2 (M.toPtr vec) (fI start) (fI x1) (fI x2)
    c64 = c_lineSearch64_2 (M.toPtr vec) (fI start) (fI x1) (fI x2)
    c = if wordSize == 32 then c32 else c64

{-# INLINE lineSearch3 #-}
lineSearch3 :: IntArray s -> Int -> Int -> Int -> Int -> ST s Int
lineSearch3 !vec !start !x1 !x2 !x3 =
    liftM fromEnum $! unsafeIOToST c
  where
    c32 = c_lineSearch32_3 (M.toPtr vec) (fI start) (fI x1) (fI x2) (fI x3)
    c64 = c_lineSearch64_3 (M.toPtr vec) (fI start) (fI x1) (fI x2) (fI x3)
    c = if wordSize == 32 then c32 else c64
#endif

{-# INLINE advanceByCacheLineSize #-}
advanceByCacheLineSize :: Int -> Int -> Int
advanceByCacheLineSize !(I# start0#) !(I# vecSize#) = out
  where
    !(I# clm#) = cacheLineIntMask
    !clmm#     = not# (int2Word# clm#)
    !start#    = word2Int# (clmm# `and#` int2Word# start0#)
    !(I# nw#)  = numWordsInCacheLine
    !start'#   = start# +# nw#
    !s#        = sign# (vecSize# -# start'# -# 1#)
    !m#        = not# (int2Word# s#)
    !r#        = int2Word# start'# `and#` m#
    !out       = I# (word2Int# r#)


{-# INLINE isCacheLineAligned #-}
isCacheLineAligned :: Int -> Bool
isCacheLineAligned (I# x#) = isTrue# (r# ==# 0#)
  where
    !(I# m#) = cacheLineIntMask
    !mw#     = int2Word# m#
    !w#      = int2Word# x#
    !r#      = word2Int# (mw# `and#` w#)


{-# INLINE sign# #-}
-- | Returns 0 if x is positive, -1 otherwise
sign# :: Int# -> Int#
sign# !x# = x# `uncheckedIShiftRA#` wordSizeMinus1#
  where
    !(I# wordSizeMinus1#) = wordSize-1


{-# INLINE bl_abs# #-}
-- | Abs of an integer, branchless
bl_abs# :: Int# -> Int#
bl_abs# !x# = word2Int# r#
  where
    !m# = sign# x#
    !r# = (int2Word# (m# +# x#)) `xor#` int2Word# m#


{-# INLINE mask# #-}
-- | Returns 0xfff..fff (aka -1) if a# == b#, 0 otherwise.
mask# :: Int# -> Int# -> Int#
mask# !a# !b# = dest#
  where
    !d#    = a# -# b#
    !r#    = bl_abs# d# -# 1#
    !dest# = sign# r#


{- note: this code should be:

mask# :: Int# -> Int# -> Int#
mask# !a# !b# = let !(I# z#) = fromEnum (a# ==# b#)
                    !q#      = negateInt# z#
                in q#

but GHC doesn't properly optimize this as straight-line code at the moment.

-}


{-# INLINE maskw# #-}
maskw# :: Int# -> Int# -> Word#
maskw# !a# !b# = int2Word# (mask# a# b#)


#ifdef NO_C_SEARCH
prefetchRead _ _ = return ()
prefetchWrite _ _ = return ()

{-# INLINE forwardSearch2 #-}
forwardSearch2 :: IntArray s -> Int -> Int -> Int -> Int -> ST s Int
forwardSearch2 !vec !start !end !x1 !x2 = go start end False
  where
    next !i !e !b = let !j = i+1
                    in if j == e
                         then (if b then (-1,e,True) else (0,start,True))
                         else (j,e,b)

    go !i !e !b = do
        h <- M.readArray vec i
        if h == x1 || h == x2
          then return i
          else do
              let (!i',!e',!b') = next i e b
              if (i' < 0) then return (-1) else go i' e' b'


{-# INLINE forwardSearch3 #-}
forwardSearch3 :: IntArray s -> Int -> Int -> Int -> Int -> Int -> ST s Int
forwardSearch3 !vec !start !end !x1 !x2 !x3 = go start end False
  where
    next !i !e !b = let !j = i+1
                    in if j == e
                         then (if b then (-1,e,True) else (0,start,True))
                         else (j,e,b)

    go !i !e !b = do
        h <- M.readArray vec i
        if h == x1 || h == x2 || h == x3
          then return i
          else do
              let (!i',!e',!b') = next i e b
              if (i' < 0) then return (-1) else go i' e' b'


deBruijnBitPositions :: U.Vector Int8
deBruijnBitPositions =
    U.fromList [
          0,   1, 28,  2, 29, 14, 24,  3, 30, 22, 20, 15, 25, 17,  4,  8,
          31, 27, 13, 23, 21, 19, 16,  7, 26, 12, 18,  6, 11,  5, 10,  9
         ]


{-# INLINE firstBitSet# #-}
-- only works with 32-bit values -- ok for us here
firstBitSet# :: Int# -> Int#
firstBitSet# i# = word2Int# ((or# zeroCase# posw#))
  where
    !zeroCase#   = int2Word# (mask# 0# i#)
    !w#          = int2Word# i#
    !iLowest#    = word2Int# (and# w# (int2Word# (negateInt# i#)))
    !idxW#       = uncheckedShiftRL#
                       (narrow32Word# (timesWord# (int2Word# iLowest#)
                                                  (int2Word# 0x077CB531#)))
                       27#
    !idx         = I# (word2Int# idxW#)
    !(I8# pos8#) = U.unsafeIndex deBruijnBitPositions idx
    !posw#       = int2Word# pos8#

#endif


------------------------------------------------------------------------------
-- | Search through a mutable vector for a given int value, cache-line aligned.
-- If the start index is cache-line aligned, and there is more than a
-- cache-line's room between the start index and the end of the vector, we will
-- search the cache-line all at once using an efficient branchless
-- bit-twiddling technique. Otherwise, we will use a typical loop.
--
cacheLineSearch :: IntArray s        -- ^ vector to search
                -> Int               -- ^ start index
                -> Int               -- ^ value to search for
                -> ST s Int          -- ^ dest index where it can be found, or
                                     -- \"-1\" if not found
cacheLineSearch !vec !start !value = do
#ifdef NO_C_SEARCH
    let !vlen  = M.length vec
    let !st1   = vlen - start
    let !nvlen = numWordsInCacheLine - st1
    let adv    = (start + cacheLineIntMask) .&. complement cacheLineIntMask
    let st2    = adv - start


    if nvlen > 0 || not (isCacheLineAligned start)
      then naiveSearch vec start (min st1 st2) value
      else lineSearch vec start value
#else
    lineSearch vec start value
#endif
{-# INLINE cacheLineSearch #-}


#ifdef NO_C_SEARCH
-- | Search through a mutable vector for a given int value. The number of
-- things to search for must be at most the number of things remaining in the
-- vector.
naiveSearch :: IntArray s       -- ^ vector to search
            -> Int              -- ^ start index
            -> Int              -- ^ number of things to search
            -> Int              -- ^ value to search for
            -> ST s Int         -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
naiveSearch !vec !start !nThings !value = go start
  where
    !doneIdx = start + nThings

    go !i | i >= doneIdx = return (-1)
          | otherwise = do
        x <- M.readArray vec i
        if x == value then return i else go (i+1)
{-# INLINE naiveSearch #-}


lineResult# :: Word#    -- ^ mask
            -> Int      -- ^ start value
            -> Int
lineResult# bitmask# (I# start#) = I# (word2Int# rv#)
  where
    !p#   = firstBitSet# (word2Int# bitmask#)
    !mm#  = maskw# p# (-1#)
    !nmm# = not# mm#
    !rv#  = mm# `or#` (nmm# `and#` (int2Word# (start# +# p#)))
{-# INLINE lineResult# #-}


lineSearch :: IntArray s        -- ^ vector to search
           -> Int               -- ^ start index
           -> Int               -- ^ value to search for
           -> ST s Int          -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
lineSearch | wordSize == 32 = lineSearch32
           | otherwise      = lineSearch64
{-# INLINE lineSearch #-}


lineSearch64 :: IntArray s        -- ^ vector to search
             -> Int               -- ^ start index
             -> Int               -- ^ value to search for
             -> ST s Int          -- ^ dest index where it can be found, or
                                  -- \"-1\" if not found
lineSearch64 !vec !start !(I# v#) = do
    (I# x1#) <- M.readArray vec $! start + 0
    let !p1# = maskw# x1# v# `and#` int2Word# 0x1#

    (I# x2#) <- M.readArray vec $! start + 1
    let !p2# = p1# `or#` (maskw# x2# v# `and#` int2Word# 0x2#)

    (I# x3#) <- M.readArray vec $! start + 2
    let !p3# = p2# `or#` (maskw# x3# v# `and#` int2Word# 0x4#)

    (I# x4#) <- M.readArray vec $! start + 3
    let !p4# = p3# `or#` (maskw# x4# v# `and#` int2Word# 0x8#)

    (I# x5#) <- M.readArray vec $! start + 4
    let !p5# = p4# `or#` (maskw# x5# v# `and#` int2Word# 0x10#)

    (I# x6#) <- M.readArray vec $! start + 5
    let !p6# = p5# `or#` (maskw# x6# v# `and#` int2Word# 0x20#)

    (I# x7#) <- M.readArray vec $! start + 6
    let !p7# = p6# `or#` (maskw# x7# v# `and#` int2Word# 0x40#)

    (I# x8#) <- M.readArray vec $! start + 7
    let !p8# = p7# `or#` (maskw# x8# v# `and#` int2Word# 0x80#)

    return $! lineResult# p8# start
{-# INLINE lineSearch64 #-}



lineSearch32 :: IntArray s        -- ^ vector to search
             -> Int               -- ^ start index
             -> Int               -- ^ value to search for
             -> ST s Int          -- ^ dest index where it can be found, or
                                  -- \"-1\" if not found
lineSearch32 !vec !start !(I# v#) = do
    (I# x1#) <- M.readArray vec $! start + 0
    let !p1# = maskw# x1# v# `and#` int2Word# 0x1#

    (I# x2#) <- M.readArray vec $! start + 1
    let !p2# = p1# `or#` (maskw# x2# v# `and#` int2Word# 0x2#)

    (I# x3#) <- M.readArray vec $! start + 2
    let !p3# = p2# `or#` (maskw# x3# v# `and#` int2Word# 0x4#)

    (I# x4#) <- M.readArray vec $! start + 3
    let !p4# = p3# `or#` (maskw# x4# v# `and#` int2Word# 0x8#)

    (I# x5#) <- M.readArray vec $! start + 4
    let !p5# = p4# `or#` (maskw# x5# v# `and#` int2Word# 0x10#)

    (I# x6#) <- M.readArray vec $! start + 5
    let !p6# = p5# `or#` (maskw# x6# v# `and#` int2Word# 0x20#)

    (I# x7#) <- M.readArray vec $! start + 6
    let !p7# = p6# `or#` (maskw# x7# v# `and#` int2Word# 0x40#)

    (I# x8#) <- M.readArray vec $! start + 7
    let !p8# = p7# `or#` (maskw# x8# v# `and#` int2Word# 0x80#)

    (I# x9#) <- M.readArray vec $! start + 8
    let !p9# = p8# `or#` (maskw# x9# v# `and#` int2Word# 0x100#)

    (I# x10#) <- M.readArray vec $! start + 9
    let !p10# = p9# `or#` (maskw# x10# v# `and#` int2Word# 0x200#)

    (I# x11#) <- M.readArray vec $! start + 10
    let !p11# = p10# `or#` (maskw# x11# v# `and#` int2Word# 0x400#)

    (I# x12#) <- M.readArray vec $! start + 11
    let !p12# = p11# `or#` (maskw# x12# v# `and#` int2Word# 0x800#)

    (I# x13#) <- M.readArray vec $! start + 12
    let !p13# = p12# `or#` (maskw# x13# v# `and#` int2Word# 0x1000#)

    (I# x14#) <- M.readArray vec $! start + 13
    let !p14# = p13# `or#` (maskw# x14# v# `and#` int2Word# 0x2000#)

    (I# x15#) <- M.readArray vec $! start + 14
    let !p15# = p14# `or#` (maskw# x15# v# `and#` int2Word# 0x4000#)

    (I# x16#) <- M.readArray vec $! start + 15
    let !p16# = p15# `or#` (maskw# x16# v# `and#` int2Word# 0x8000#)

    return $! lineResult# p16# start
{-# INLINE lineSearch32 #-}

#endif

------------------------------------------------------------------------------
-- | Search through a mutable vector for one of two given int values,
-- cache-line aligned.  If the start index is cache-line aligned, and there is
-- more than a cache-line's room between the start index and the end of the
-- vector, we will search the cache-line all at once using an efficient
-- branchless bit-twiddling technique. Otherwise, we will use a typical loop.
--
cacheLineSearch2 :: IntArray s        -- ^ vector to search
                 -> Int               -- ^ start index
                 -> Int               -- ^ value to search for
                 -> Int               -- ^ value 2 to search for
                 -> ST s Int          -- ^ dest index where it can be found, or
                                     -- \"-1\" if not found
cacheLineSearch2 !vec !start !value !value2 = do
#ifdef NO_C_SEARCH
    let !vlen  = M.length vec
    let !st1   = vlen - start
    let !nvlen = numWordsInCacheLine - st1
    let adv    = (start + cacheLineIntMask) .&. complement cacheLineIntMask
    let st2    = adv - start

    if nvlen > 0 || not (isCacheLineAligned start)
      then naiveSearch2 vec start (min st1 st2) value value2
      else lineSearch2 vec start value value2
#else
    lineSearch2 vec start value value2
#endif
{-# INLINE cacheLineSearch2 #-}


#ifdef NO_C_SEARCH

naiveSearch2 :: IntArray s       -- ^ vector to search
             -> Int              -- ^ start index
             -> Int              -- ^ number of things to search
             -> Int              -- ^ value to search for
             -> Int              -- ^ value 2 to search for
             -> ST s Int         -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
naiveSearch2 !vec !start !nThings !value1 !value2 = go start
  where
    !doneIdx = start + nThings

    go !i | i >= doneIdx = return (-1)
          | otherwise = do
        x <- M.readArray vec i
        if x == value1 || x == value2 then return i else go (i+1)
{-# INLINE naiveSearch2 #-}


lineSearch2 :: IntArray s        -- ^ vector to search
            -> Int               -- ^ start index
            -> Int               -- ^ value to search for
            -> Int               -- ^ value 2 to search for
            -> ST s Int          -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
lineSearch2 | wordSize == 32 = lineSearch32_2
            | otherwise      = lineSearch64_2



lineSearch64_2 :: IntArray s        -- ^ vector to search
               -> Int               -- ^ start index
               -> Int               -- ^ value to search for
               -> Int               -- ^ value 2 to search for
               -> ST s Int          -- ^ dest index where it can be found, or
                                    -- \"-1\" if not found
lineSearch64_2 !vec !start !(I# v#) !(I# v2#) = do
    (I# x1#) <- M.readArray vec $! start + 0
    let !p1# = (maskw# x1# v# `or#` maskw# x1# v2#) `and#` int2Word# 0x1#

    (I# x2#) <- M.readArray vec $! start + 1
    let !p2# = p1# `or#` ((maskw# x2# v# `or#` maskw# x2# v2#)
                          `and#` int2Word# 0x2#)

    (I# x3#) <- M.readArray vec $! start + 2
    let !p3# = p2# `or#` ((maskw# x3# v# `or#` maskw# x3# v2#)
                          `and#` int2Word# 0x4#)

    (I# x4#) <- M.readArray vec $! start + 3
    let !p4# = p3# `or#` ((maskw# x4# v# `or#` maskw# x4# v2#)
                          `and#` int2Word# 0x8#)

    (I# x5#) <- M.readArray vec $! start + 4
    let !p5# = p4# `or#` ((maskw# x5# v# `or#` maskw# x5# v2#)
                          `and#` int2Word# 0x10#)

    (I# x6#) <- M.readArray vec $! start + 5
    let !p6# = p5# `or#` ((maskw# x6# v# `or#` maskw# x6# v2#)
                          `and#` int2Word# 0x20#)

    (I# x7#) <- M.readArray vec $! start + 6
    let !p7# = p6# `or#` ((maskw# x7# v# `or#` maskw# x7# v2#)
                          `and#` int2Word# 0x40#)

    (I# x8#) <- M.readArray vec $! start + 7
    let !p8# = p7# `or#` ((maskw# x8# v# `or#` maskw# x8# v2#)
                          `and#` int2Word# 0x80#)

    return $! lineResult# p8# start
{-# INLINE lineSearch64_2 #-}


lineSearch32_2 :: IntArray s        -- ^ vector to search
               -> Int               -- ^ start index
               -> Int               -- ^ value to search for
               -> Int               -- ^ value 2 to search for
               -> ST s Int          -- ^ dest index where it can be found, or
                                    -- \"-1\" if not found
lineSearch32_2 !vec !start !(I# v#) !(I# v2#) = do
    (I# x1#) <- M.readArray vec $! start + 0
    let !p1# = (maskw# x1# v# `or#` maskw# x1# v2#) `and#` int2Word# 0x1#

    (I# x2#) <- M.readArray vec $! start + 1
    let !p2# = p1# `or#` ((maskw# x2# v# `or#` maskw# x2# v2#)
                          `and#` int2Word# 0x2#)

    (I# x3#) <- M.readArray vec $! start + 2
    let !p3# = p2# `or#` ((maskw# x3# v# `or#` maskw# x3# v2#)
                          `and#` int2Word# 0x4#)

    (I# x4#) <- M.readArray vec $! start + 3
    let !p4# = p3# `or#` ((maskw# x4# v# `or#` maskw# x4# v2#)
                          `and#` int2Word# 0x8#)

    (I# x5#) <- M.readArray vec $! start + 4
    let !p5# = p4# `or#` ((maskw# x5# v# `or#` maskw# x5# v2#)
                          `and#` int2Word# 0x10#)

    (I# x6#) <- M.readArray vec $! start + 5
    let !p6# = p5# `or#` ((maskw# x6# v# `or#` maskw# x6# v2#)
                          `and#` int2Word# 0x20#)

    (I# x7#) <- M.readArray vec $! start + 6
    let !p7# = p6# `or#` ((maskw# x7# v# `or#` maskw# x7# v2#)
                          `and#` int2Word# 0x40#)

    (I# x8#) <- M.readArray vec $! start + 7
    let !p8# = p7# `or#` ((maskw# x8# v# `or#` maskw# x8# v2#)
                          `and#` int2Word# 0x80#)

    (I# x9#) <- M.readArray vec $! start + 8
    let !p9# = p8# `or#` ((maskw# x9# v# `or#` maskw# x9# v2#)
                          `and#` int2Word# 0x100#)

    (I# x10#) <- M.readArray vec $! start + 9
    let !p10# = p9# `or#` ((maskw# x10# v# `or#` maskw# x10# v2#)
                           `and#` int2Word# 0x200#)

    (I# x11#) <- M.readArray vec $! start + 10
    let !p11# = p10# `or#` ((maskw# x11# v# `or#` maskw# x11# v2#)
                            `and#` int2Word# 0x400#)

    (I# x12#) <- M.readArray vec $! start + 11
    let !p12# = p11# `or#` ((maskw# x12# v# `or#` maskw# x12# v2#)
                            `and#` int2Word# 0x800#)

    (I# x13#) <- M.readArray vec $! start + 12
    let !p13# = p12# `or#` ((maskw# x13# v# `or#` maskw# x13# v2#)
                            `and#` int2Word# 0x1000#)

    (I# x14#) <- M.readArray vec $! start + 13
    let !p14# = p13# `or#` ((maskw# x14# v# `or#` maskw# x14# v2#)
                            `and#` int2Word# 0x2000#)

    (I# x15#) <- M.readArray vec $! start + 14
    let !p15# = p14# `or#` ((maskw# x15# v# `or#` maskw# x15# v2#)
                            `and#` int2Word# 0x4000#)

    (I# x16#) <- M.readArray vec $! start + 15
    let !p16# = p15# `or#` ((maskw# x16# v# `or#` maskw# x16# v2#)
                            `and#` int2Word# 0x8000#)

    return $! lineResult# p16# start
{-# INLINE lineSearch32_2 #-}

#endif


------------------------------------------------------------------------------
-- | Search through a mutable vector for one of three given int values,
-- cache-line aligned.  If the start index is cache-line aligned, and there is
-- more than a cache-line's room between the start index and the end of the
-- vector, we will search the cache-line all at once using an efficient
-- branchless bit-twiddling technique. Otherwise, we will use a typical loop.
--
cacheLineSearch3 :: IntArray s        -- ^ vector to search
                 -> Int               -- ^ start index
                 -> Int               -- ^ value to search for
                 -> Int               -- ^ value 2 to search for
                 -> Int               -- ^ value 3 to search for
                 -> ST s Int          -- ^ dest index where it can be found, or
                                     -- \"-1\" if not found
cacheLineSearch3 !vec !start !value !value2 !value3 = do
#ifdef NO_C_SEARCH
    let !vlen  = M.length vec
    let !st1   = vlen - start
    let !nvlen = numWordsInCacheLine - st1
    let adv    = (start + cacheLineIntMask) .&. complement cacheLineIntMask
    let st2    = adv - start

    if nvlen > 0 || not (isCacheLineAligned start)
      then naiveSearch3 vec start (min st1 st2) value value2 value3
      else lineSearch3 vec start value value2 value3
#else
    lineSearch3 vec start value value2 value3
#endif
{-# INLINE cacheLineSearch3 #-}


#ifdef NO_C_SEARCH

naiveSearch3 :: IntArray s       -- ^ vector to search
             -> Int              -- ^ start index
             -> Int              -- ^ number of things to search
             -> Int              -- ^ value to search for
             -> Int              -- ^ value 2 to search for
             -> Int              -- ^ value 3 to search for
             -> ST s Int         -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
naiveSearch3 !vec !start !nThings !value1 !value2 !value3 = go start
  where
    !doneIdx = start + nThings

    go !i | i >= doneIdx = return (-1)
          | otherwise = do
        x <- M.readArray vec i
        if x == value1 || x == value2 || x == value3
          then return i
          else go (i+1)
{-# INLINE naiveSearch3 #-}


lineSearch3 :: IntArray s        -- ^ vector to search
            -> Int               -- ^ start index
            -> Int               -- ^ value to search for
            -> Int               -- ^ value 2 to search for
            -> Int               -- ^ value 3 to search for
            -> ST s Int          -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
lineSearch3 | wordSize == 32 = lineSearch32_3
            | otherwise      = lineSearch64_3



lineSearch64_3 :: IntArray s        -- ^ vector to search
               -> Int               -- ^ start index
               -> Int               -- ^ value to search for
               -> Int               -- ^ value 2 to search for
               -> Int               -- ^ value 3 to search for
               -> ST s Int          -- ^ dest index where it can be found, or
                                    -- \"-1\" if not found
lineSearch64_3 !vec !start !(I# v#) !(I# v2#) !(I# v3#) = do
    (I# x1#) <- M.readArray vec $! start + 0
    let !p1# = (maskw# x1# v# `or#` maskw# x1# v2# `or#` maskw# x1# v3#)
               `and#` int2Word# 0x1#

    (I# x2#) <- M.readArray vec $! start + 1
    let !p2# = p1# `or#`
               ((maskw# x2# v# `or#` maskw# x2# v2# `or#` maskw# x2# v3#)
                `and#` int2Word# 0x2#)

    (I# x3#) <- M.readArray vec $! start + 2
    let !p3# = p2# `or#`
               ((maskw# x3# v# `or#` maskw# x3# v2# `or#` maskw# x3# v3#)
                `and#` int2Word# 0x4#)

    (I# x4#) <- M.readArray vec $! start + 3
    let !p4# = p3# `or#`
               ((maskw# x4# v# `or#` maskw# x4# v2# `or#` maskw# x4# v3#)
                `and#` int2Word# 0x8#)

    (I# x5#) <- M.readArray vec $! start + 4
    let !p5# = p4# `or#`
               ((maskw# x5# v# `or#` maskw# x5# v2# `or#` maskw# x5# v3#)
                `and#` int2Word# 0x10#)

    (I# x6#) <- M.readArray vec $! start + 5
    let !p6# = p5# `or#`
               ((maskw# x6# v# `or#` maskw# x6# v2# `or#` maskw# x6# v3#)
                `and#` int2Word# 0x20#)

    (I# x7#) <- M.readArray vec $! start + 6
    let !p7# = p6# `or#`
               ((maskw# x7# v# `or#` maskw# x7# v2# `or#` maskw# x7# v3#)
                `and#` int2Word# 0x40#)

    (I# x8#) <- M.readArray vec $! start + 7
    let !p8# = p7# `or#`
               ((maskw# x8# v# `or#` maskw# x8# v2# `or#` maskw# x8# v3#)
                `and#` int2Word# 0x80#)

    return $! lineResult# p8# start
{-# INLINE lineSearch64_3 #-}


lineSearch32_3 :: IntArray s        -- ^ vector to search
               -> Int               -- ^ start index
               -> Int               -- ^ value to search for
               -> Int               -- ^ value 2 to search for
               -> Int               -- ^ value 3 to search for
               -> ST s Int          -- ^ dest index where it can be found, or
                                    -- \"-1\" if not found
lineSearch32_3 !vec !start !(I# v#) !(I# v2#) !(I# v3#) = do
    (I# x1#) <- M.readArray vec $! start + 0
    let !p1# = (maskw# x1# v# `or#` maskw# x1# v2# `or#` maskw# x1# v3#)
               `and#` int2Word# 0x1#

    (I# x2#) <- M.readArray vec $! start + 1
    let !p2# = p1# `or#`
               ((maskw# x2# v# `or#` maskw# x2# v2# `or#` maskw# x2# v3#)
                `and#` int2Word# 0x2#)

    (I# x3#) <- M.readArray vec $! start + 2
    let !p3# = p2# `or#`
               ((maskw# x3# v# `or#` maskw# x3# v2# `or#` maskw# x3# v3#)
                `and#` int2Word# 0x4#)

    (I# x4#) <- M.readArray vec $! start + 3
    let !p4# = p3# `or#`
               ((maskw# x4# v# `or#` maskw# x4# v2# `or#` maskw# x4# v3#)
                `and#` int2Word# 0x8#)

    (I# x5#) <- M.readArray vec $! start + 4
    let !p5# = p4# `or#`
               ((maskw# x5# v# `or#` maskw# x5# v2# `or#` maskw# x5# v3#)
                `and#` int2Word# 0x10#)

    (I# x6#) <- M.readArray vec $! start + 5
    let !p6# = p5# `or#`
               ((maskw# x6# v# `or#` maskw# x6# v2# `or#` maskw# x6# v3#)
                `and#` int2Word# 0x20#)

    (I# x7#) <- M.readArray vec $! start + 6
    let !p7# = p6# `or#`
               ((maskw# x7# v# `or#` maskw# x7# v2# `or#` maskw# x7# v3#)
                `and#` int2Word# 0x40#)

    (I# x8#) <- M.readArray vec $! start + 7
    let !p8# = p7# `or#`
               ((maskw# x8# v# `or#` maskw# x8# v2# `or#` maskw# x8# v3#)
                `and#` int2Word# 0x80#)

    (I# x9#) <- M.readArray vec $! start + 8
    let !p9# = p8# `or#`
               ((maskw# x9# v# `or#` maskw# x9# v2# `or#` maskw# x9# v3#)
                `and#` int2Word# 0x100#)

    (I# x10#) <- M.readArray vec $! start + 9
    let !p10# = p9# `or#`
                ((maskw# x10# v# `or#` maskw# x10# v2# `or#` maskw# x10# v3#)
                 `and#` int2Word# 0x200#)

    (I# x11#) <- M.readArray vec $! start + 10
    let !p11# = p10# `or#`
                ((maskw# x11# v# `or#` maskw# x11# v2# `or#` maskw# x11# v3#)
                 `and#` int2Word# 0x400#)

    (I# x12#) <- M.readArray vec $! start + 11
    let !p12# = p11# `or#`
                ((maskw# x12# v# `or#` maskw# x12# v2# `or#` maskw# x12# v3#)
                 `and#` int2Word# 0x800#)

    (I# x13#) <- M.readArray vec $! start + 12
    let !p13# = p12# `or#`
                ((maskw# x13# v# `or#` maskw# x13# v2# `or#` maskw# x13# v3#)
                 `and#` int2Word# 0x1000#)

    (I# x14#) <- M.readArray vec $! start + 13
    let !p14# = p13# `or#`
                ((maskw# x14# v# `or#` maskw# x14# v2# `or#` maskw# x14# v3#)
                 `and#` int2Word# 0x2000#)

    (I# x15#) <- M.readArray vec $! start + 14
    let !p15# = p14# `or#`
                ((maskw# x15# v# `or#` maskw# x15# v2# `or#` maskw# x15# v3#)
                 `and#` int2Word# 0x4000#)

    (I# x16#) <- M.readArray vec $! start + 15
    let !p16# = p15# `or#`
                ((maskw# x16# v# `or#` maskw# x16# v2# `or#` maskw# x16# v3#)
                 `and#` int2Word# 0x8000#)

    return $! lineResult# p16# start
{-# INLINE lineSearch32_3 #-}

#endif