{-# OPTIONS -fffi #-} ----------------------------------------------------------------------------- -- | -- Module : FastPackedString -- Copyright : (c) The University of Glasgow 2001, David Roundy 2003 -- License : GPL (I'm happy to also license this file BSD style but don't -- want to bother distributing two license files with darcs. -- -- Maintainer : droundy@abridgegame.org -- Stability : experimental -- Portability : portable -- -- An efficient implementation of strings. -- ----------------------------------------------------------------------------- -- Original GHC implementation by Bryan O\'Sullivan, -- rewritten to use UArray by Simon Marlow. -- rewritten to support slices and use ForeignPtr by David Roundy module FastPackedString ( -- * The @PackedString@ type PackedString, -- abstract, instances: Eq, Ord, Show, Typeable unsafeWithInternals, -- :: PackedString -> (Ptr Word8 -> Int -> IO a) -> IO a -- * Converting to and from @PackedString@s generatePS, -- :: Int -> (Ptr Word8 -> Int -> IO Int) -> IO PackedString packString, -- :: String -> PackedString packWords, -- :: [Word8] -> PackedString unpackPS, -- :: PackedString -> String unpackWords, -- :: PackedString -> [Word8] unpackPSfromUTF8, -- :: PackedString -> String -- * I\/O with @PackedString@s hPutPS, -- :: Handle -> PackedString -> IO () hGetPS, -- :: Handle -> Int -> IO PackedString hGetContentsPS, -- :: Handle -> IO PackedString readFilePS, -- :: FilePath -> IO PackedString writeFilePS, -- :: FilePath -> PackedString -> IO () gzReadFilePS,-- :: FilePath -> IO PackedString mmapFilePS, -- :: FilePath -> IO PackedString -- * List-like manipulation functions nilPS, -- :: PackedString consPS, -- :: Char -> PackedString -> PackedString headPS, -- :: PackedString -> Char tailPS, -- :: PackedString -> PackedString lastPS, -- :: PackedString -> Char nullPS, -- :: PackedString -> Bool appendPS, -- :: PackedString -> PackedString -> PackedString lengthPS, -- :: PackedString -> Int indexPS, -- :: PackedString -> Int -> Char indexPSW, -- :: PackedString -> Int -> Word8 mapPS, -- :: (Char -> Char) -> PackedString -> PackedString --filterPS, -- :: (Char -> Bool) -> PackedString -> PackedString reversePS, -- :: PackedString -> PackedString concatPS, -- :: [PackedString] -> PackedString concatLenPS, -- :: Int -> [PackedString] -> PackedString elemPS, -- :: Char -> PackedString -> Bool takePS, -- :: Int -> PackedString -> PackedString dropPS, -- :: Int -> PackedString -> PackedString splitAtPS, -- :: Int -> PackedString -> (PackedString, PackedString) foldlPS, -- :: (a -> Char -> a) -> a -> PackedString -> a foldrPS, -- :: (Char -> a -> a) -> a -> PackedString -> a anyPS, takeWhilePS, -- :: (Char -> Bool) -> PackedString -> PackedString dropWhilePS, -- :: (Char -> Bool) -> PackedString -> PackedString dropWhitePS, -- :: PackedString -> PackedString breakWhitePS,-- :: PackedString -> Maybe (PackedString,PackedString) spanPS, -- :: (Char -> Bool) -> PackedString -> (PackedString, PackedString) spanEndPS, -- :: (Char -> Bool) -> PackedString -> (PackedString, PackedString) breakPS, -- :: (Char -> Bool) -> PackedString -> (PackedString, PackedString) breakOnPS, -- :: Char -> PackedString -> (PackedString, PackedString) linesPS, -- :: PackedString -> [PackedString] unlinesPS, -- :: [PackedString] -> PackedString findPS, wordsPS, -- :: PackedString -> [PackedString] splitPS, -- :: Char -> PackedString -> [PackedString] splitWithPS, -- :: (Char -> Bool) -> PackedString -> [PackedString] -- joinPS, -- :: PackedString -> [PackedString] -> PackedString breakFirstPS,-- :: Char -> PackedString -> Maybe (PackedString,PackedString) breakLastPS, -- :: Char -> PackedString -> Maybe (PackedString,PackedString) readIntPS, -- :: PackedString -> Maybe (Int, PackedString) mylexPS, -- :: PackedString -> Maybe (PackedString, PackedString) is_funky, -- :: PackedString -> Bool fromHex2PS, -- :: PackedString -> PackedString fromPS2Hex, -- :: PackedString -> PackedString betweenLinesPS,-- :: PackedString -> PackedString -> PackedString -> Maybe (PackedString) ) where import IO ( Handle, hClose, hFileSize, IOMode(ReadMode,WriteMode), hSeek, SeekMode(SeekFromEnd), hGetChar ) import Autoconf ( use_mmap, use_hideous_malloc_hack ) import System.Mem ( performGC ) import Foreign.Storable ( peekElemOff, peek, poke ) import Ptr ( nullPtr, plusPtr, minusPtr, Ptr ) import Foreign.Marshal.Alloc ( free ) import Foreign.Marshal.Array ( pokeArray, mallocArray, reallocArray, peekArray, advancePtr ) import Foreign.Marshal.Utils ( with ) import Foreign.C.String import Foreign.C.Types ( CLong, CInt, CSize ) import Data.Char import Data.Word import Monad ( liftM, when ) import System.IO.Unsafe ( unsafePerformIO ) import System.IO ( hPutBuf, hGetBuf ) import System.IO ( openBinaryFile ) import Foreign.ForeignPtr( ForeignPtr, withForeignPtr, mallocForeignPtrArray, newForeignPtr ) #if defined(__GLASGOW_HASKELL__) import qualified Foreign.Concurrent as FC ( newForeignPtr ) import System.Posix ( handleToFd ) #endif #ifdef DEBUG_PS import Foreign.ForeignPtr ( addForeignPtrFinalizer ) #endif import Foreign.Ptr ( FunPtr ) debugForeignPtr :: ForeignPtr a -> String -> IO () #ifdef DEBUG_PS foreign import ccall unsafe "static fpstring.h debug_alloc" debug_alloc :: Ptr a -> CString -> IO () foreign import ccall unsafe "static fpstring.h & debug_free" debug_free :: FunPtr (Ptr a -> IO ()) debugForeignPtr fp n = withCString n $ \cname-> withForeignPtr fp $ \p-> do debug_alloc p cname addForeignPtrFinalizer debug_free fp #else debugForeignPtr _ _ = return () #endif foreign import ccall unsafe "static fpstring.h dumb_malloc" dumb_malloc :: Int -> IO (Ptr Word8) foreign import ccall unsafe "static stdio.h &free" c_free :: FunPtr (Ptr Word8 -> IO ()) mallocForeignPtr :: Int -> IO (ForeignPtr Word8) mallocForeignPtr l = do when (l > 1000000) performGC if use_hideous_malloc_hack then do p <- dumb_malloc $ max 1 l when (p == nullPtr) $ fail $ "Memory allocation error! "++show l newForeignPtr c_free p else mallocForeignPtrArray l ---------------------------------------------------------------------------- --A way of creating ForeignPtrs outside the IO monad (althogh it still --isn't entirely "safe", but at least it's convenient. createPS :: Int -> (Ptr Word8 -> IO ()) -> PackedString createPS l write_ptr = unsafePerformIO $ do fp <- mallocForeignPtr l debugForeignPtr fp "createPS" withForeignPtr fp $ \p -> write_ptr p return $ PS fp 0 l -- ----------------------------------------------------------------------------- -- PackedString type declaration -- | A space-efficient representation of a 'String', which supports various -- efficient operations. A 'PackedString' contains full Unicode 'Char's. data PackedString = PS !(ForeignPtr Word8) !Int !Int -- ----------------------------------------------------------------------------- -- unsafeWithInternals -- | Do something with the internals of a PackedString. Beware of -- altering the contents! unsafeWithInternals :: PackedString -> (Ptr Word8 -> Int -> IO a) -> IO a unsafeWithInternals (PS fp s l) f = withForeignPtr fp $ \p -> f (p `plusPtr` s) l {-# INLINE (!) #-} (!) :: PackedString -> Int -> Word8 (PS x s _l) ! i = unsafePerformIO $ withForeignPtr x $ \p -> peekElemOff p (s+i) -- | i < 0 = error "Can't access negative element in PackedString." -- | i >= l = error "Out of range element in PackedString." -- | otherwise = unsafePerformIO $ withForeignPtr x $ \p -> peekElemOff p (s+i) -- ----------------------------------------------------------------------------- -- generatePS -- | Given the maximum size needed and a function to make the contents -- of a PackedString, generatePS makes the PackedString. The generating -- function is required to return the actual size (<= the maximum size). generatePS :: Int -> (Ptr Word8 -> IO Int) -> IO PackedString generatePS i f = do p <- mallocArray i i' <- f p p' <- reallocArray p i' fp <- newForeignPtr c_free p' return $ PS fp 0 i' instance Eq PackedString where (==) = psniceq foreign import ccall unsafe "static string.h memcmp" c_memcmp :: Ptr Word8 -> Ptr Word8 -> Int -> IO Int {-# INLINE psniceq #-} psniceq :: PackedString -> PackedString -> Bool psniceq a b | nullPS a && nullPS b = True psniceq (PS x1 s1 l1) (PS x2 s2 l2) = ((l1 == l2) &&) $ unsafePerformIO $ withForeignPtr x1 $ \p1-> withForeignPtr x2 $ \p2 -> if p1 == p2 && s1 == s2 then return True else liftM (==0) $ c_memcmp (p1 `plusPtr` s1) (p2 `plusPtr` s2) l1 instance Ord PackedString where compare = pscmp pscmp :: PackedString -> PackedString -> Ordering pscmp (PS x1 s1 l1) (PS x2 s2 l2) = unsafePerformIO $ withForeignPtr x1 $ \p1-> withForeignPtr x2 $ \p2 -> let doc :: Ptr Word8 -> Ptr Word8 -> IO Ordering st1 = p1 `plusPtr` s1 `plusPtr` l1 st2 = p2 `plusPtr` s2 `plusPtr` l2 doc w1 w2 = if w1 == st1 && w2 == st2 then return EQ else if w1 == st1 then return LT else if w2 == st2 then return GT else do h1 <- peek w1 h2 <- peek w2 if h1 < h2 then return LT else if h1 == h2 then doc (w1 `plusPtr` 1) (w2 `plusPtr` 1) else return GT in doc (p1 `plusPtr` s1) (p2 `plusPtr` s2) --instance Read PackedString: ToDo instance Show PackedString where showsPrec p ps r = showsPrec p (unpackPS ps) r -- ----------------------------------------------------------------------------- -- Constructor functions nilPS :: PackedString nilPS = unsafePerformIO $ do fp <- mallocForeignPtr 1 debugForeignPtr fp "nilPS" return $ PS fp 0 0 consPS :: Char -> PackedString -> PackedString consPS c cs = packString (c : (unpackPS cs)) -- ToDo:better -- | Convert a 'String' into a 'PackedString' packString :: String -> PackedString packString str = createPS (length str) $ \p -> pokeArray p $ map c2w str packWords :: [Word8] -> PackedString packWords s = createPS (length s) $ \p -> pokeArray p s {-# INLINE w2c #-} w2c :: Word8 -> Char w2c = chr . fromIntegral {-# INLINE c2w #-} c2w :: Char -> Word8 c2w = fromIntegral . ord -- ----------------------------------------------------------------------------- -- Destructor functions (taking PackedStrings apart) -- | Convert a 'PackedString' into a 'String' unpackPS :: PackedString -> String unpackPS (PS ps s l) = map w2c $ unsafePerformIO $ withForeignPtr ps $ \p -> peekArray l (p `plusPtr` s) {- unpackPS :: PackedString -> String unpackPS theps@(PS ps s l) | l >= 1024 = map w2c (unsafePerformIO (withForeignPtr ps $ \p -> peekArray 1024 (p `plusPtr` s))) ++ unpackPS (PS ps (s + 1024) (l - 1024)) | l >= 128 = map w2c (unsafePerformIO (withForeignPtr ps $ \p -> peekArray 128 (p `plusPtr` s))) ++ unpackPS (PS ps (s + 128) (l - 128)) | l > 0 = unsafeHeadPS theps : unpackPS (unsafeTailPS theps) | otherwise = "" -} {- unpackPS :: PackedString -> String unpackPS theps = if nullPS theps then [] else unsafeHeadPS theps : unpackPS (unsafeTailPS theps) -} unpackWords :: PackedString -> [Word8] unpackWords ps@(PS x s _) = if nullPS ps then [] else (unsafePerformIO $ withForeignPtr x $ \p -> peekElemOff p s) : unpackWords (unsafeTailPS ps) unpackPSfromUTF8 :: PackedString -> String unpackPSfromUTF8 (PS _ _ 0) = "" unpackPSfromUTF8 (PS x s l) = unsafePerformIO $ withForeignPtr x $ \p-> do outbuf <- mallocArray l lout <- utf8_to_ints outbuf (p `plusPtr` s) l when (lout < 0) $ error "Bad UTF8!" str <- (map chr) `liftM` peekArray lout outbuf free outbuf return str foreign import ccall unsafe "static fpstring.h utf8_to_ints" utf8_to_ints :: Ptr Int -> Ptr Word8 -> Int -> IO Int -- ----------------------------------------------------------------------------- -- List-mimicking functions for PackedStrings {-# INLINE lengthPS #-} lengthPS :: PackedString -> Int lengthPS (PS _ _ l) = l {-# INLINE indexPSW #-} indexPSW :: PackedString -> Int -> Word8 indexPSW theps i | i < 0 = error "Negative index in indexPS" | i >= lengthPS theps = error "Out of bounds in indexPS" | otherwise = theps ! i {-# INLINE indexPS #-} indexPS :: PackedString -> Int -> Char indexPS theps i | i < 0 = error "Negative index in indexPS" | i >= lengthPS theps = error "Out of bounds in indexPS" | otherwise = w2c $ theps ! i {-# INLINE lastPS #-} lastPS :: PackedString -> Char lastPS ps@(PS x s l) -- ps ! 0 is inlined manually to eliminate a (+0) | nullPS ps = error "FastPackedString.lastPS: last []" | otherwise = w2c $ unsafePerformIO $ withForeignPtr x $ \p -> peekElemOff p (s+l-1) {-# INLINE headPS #-} headPS :: PackedString -> Char headPS ps@(PS x s _) -- ps ! 0 is inlined manually to eliminate a (+0) | nullPS ps = error "FastPackedString.headPS: head []" | otherwise = w2c $ unsafePerformIO $ withForeignPtr x $ \p -> peekElemOff p s {-# INLINE unsafeHeadPS #-} unsafeHeadPS :: PackedString -> Char unsafeHeadPS (PS x s _) -- ps ! 0 is inlined manually to eliminate a (+0) = w2c $ unsafePerformIO $ withForeignPtr x $ \p -> peekElemOff p s {-# INLINE tailPS #-} tailPS :: PackedString -> PackedString tailPS ps | len <= 0 = error "FastPackedString.tailPS: tail []" | len == 1 = nilPS | otherwise = substrPS ps 1 (len - 1) where len = lengthPS ps {-# INLINE unsafeTailPS #-} unsafeTailPS :: PackedString -> PackedString unsafeTailPS (PS ps s l) | l == 1 = nilPS | otherwise = PS ps (s+1) (l-1) {-# INLINE nullPS #-} nullPS :: PackedString -> Bool nullPS (PS _ _ l) = l == 0 appendPS :: PackedString -> PackedString -> PackedString appendPS xs ys | nullPS xs = ys | nullPS ys = xs | otherwise = concatPS [xs,ys] mapPS :: (Char -> Char) -> PackedString -> PackedString mapPS func (PS ps s l) = createPS l $ \p-> withForeignPtr ps $ \f-> mint (f `plusPtr` s) p l where mint :: Ptr Word8 -> Ptr Word8 -> Int -> IO () mint _ _ 0 = return () mint f t len = do val <- peek f poke t $ c2w $ func $ w2c val mint (f `plusPtr` 1) (t `plusPtr` 1) (len - 1) --filterPS :: (Char -> Bool) -> PackedString -> PackedString {-or String?-} --filterPS pred ps = packString (filter pred (unpackPS ps)) foldlPS :: (a -> Char -> a) -> a -> PackedString -> a foldlPS f b ps = foldl f b (unpackPS ps) foldrPS :: (Char -> a -> a) -> a -> PackedString -> a foldrPS f v ps = foldr f v (unpackPS ps) {-# INLINE takePS #-} takePS :: Int -> PackedString -> PackedString takePS n ps@(PS x s _) = if n >= lengthPS ps then ps else PS x s n -- substrPS ps 0 (n - 1) {-# INLINE dropPS #-} dropPS :: Int -> PackedString -> PackedString dropPS n ps@(PS x s l) | n >= lengthPS ps = nilPS | otherwise = PS x (s+n) (l-n) -- substrPS ps n (lengthPS ps - 1) {-# INLINE splitAtPS #-} splitAtPS :: Int -> PackedString -> (PackedString, PackedString) splitAtPS n ps = (takePS n ps, dropPS n ps) -- This must be fast, it's used heavily in Printer. -- jch anyPS :: (Char -> Bool) -> PackedString -> Bool anyPS f (PS x s l) = unsafePerformIO $ withForeignPtr x $ \ptr -> lookat (ptr `plusPtr` s) (ptr `plusPtr` (s+l)) where lookat :: Ptr Word8 -> Ptr Word8 -> IO Bool lookat p st | p == st = return False | otherwise = do w <- peek p if f $ w2c w then return True else lookat (p `plusPtr` 1) st findWhenPS :: (Char -> Bool) -> PackedString -> Int findWhenPS f ps = seq f $ if nullPS ps then 0 else if f $ unsafeHeadPS ps then 0 else 1 + findWhenPS f (unsafeTailPS ps) findFromEndUntilPS :: (Char -> Bool) -> PackedString -> Int findFromEndUntilPS f ps@(PS x s l) = seq f $ if nullPS ps then 0 else if f $ lastPS ps then l else findFromEndUntilPS f (PS x s (l-1)) {-# INLINE takeWhilePS #-} takeWhilePS :: (Char -> Bool) -> PackedString -> PackedString takeWhilePS f ps = seq f $ takePS (findWhenPS (not . f) ps) ps {-# INLINE dropWhilePS #-} dropWhilePS :: (Char -> Bool) -> PackedString -> PackedString dropWhilePS f ps = seq f $ dropPS (findWhenPS (not . f) ps) ps {-# INLINE dropWhitePS #-} dropWhitePS :: PackedString -> PackedString dropWhitePS (PS x s l) = unsafePerformIO $ withForeignPtr x $ \p-> do i <- first_nonwhite (p `plusPtr` s) l return $ if i == l then nilPS else PS x (s+i) (l-i) foreign import ccall unsafe "fpstring.h first_nonwhite" first_nonwhite :: Ptr Word8 -> Int -> IO Int foreign import ccall unsafe "fpstring.h first_white" first_white :: Ptr Word8 -> Int -> IO Int {-# INLINE is_funky #-} is_funky :: PackedString -> Bool is_funky (PS x s l) = unsafePerformIO $ withForeignPtr x $ \p-> (/=0) `liftM` has_funky_char (p `plusPtr` s) l foreign import ccall unsafe "fpstring.h has_funky_char" has_funky_char :: Ptr Word8 -> Int -> IO Int elemPS :: Char -> PackedString -> Bool elemPS c ps = c `elem` unpackPS ps spanPS :: (Char -> Bool) -> PackedString -> (PackedString, PackedString) spanPS p ps = breakPS (not . p) ps spanEndPS :: (Char -> Bool) -> PackedString -> (PackedString, PackedString) spanEndPS p ps = splitAtPS (findFromEndUntilPS (not.p) ps) ps breakPS :: (Char -> Bool) -> PackedString -> (PackedString, PackedString) breakPS p ps = case findWhenPS p ps of n -> (takePS n ps, dropPS n ps) {-# INLINE breakOnPS #-} breakOnPS :: Char -> PackedString -> (PackedString, PackedString) breakOnPS c p = case findPS c p of Nothing -> (p,nilPS) Just n -> (takePS n p, dropPS n p) {-# INLINE breakWhitePS #-} breakWhitePS :: PackedString -> (PackedString,PackedString) breakWhitePS (PS x s l) = unsafePerformIO $ withForeignPtr x $ \p-> do i <- first_white (p `plusPtr` s) l if i == 0 then return (nilPS, PS x s l) else if i == l then return (PS x s l, nilPS) else return (PS x s i, PS x (s+i) (l-i)) {-# INLINE breakFirstPS #-} breakFirstPS :: Char -> PackedString -> Maybe (PackedString,PackedString) breakFirstPS c p = case findPS c p of Nothing -> Nothing Just n -> Just (takePS n p, dropPS (n+1) p) {-# INLINE breakLastPS #-} breakLastPS :: Char -> PackedString -> Maybe (PackedString,PackedString) breakLastPS c p = case findLastPS c p of Nothing -> Nothing Just n -> Just (takePS n p, dropPS (n+1) p) {-# INLINE linesPS #-} linesPS :: PackedString -> [PackedString] linesPS ps = case wfindPS (c2w '\n') ps of Nothing -> [ps] Just n -> takePS n ps : linesPS (dropPS (n+1) ps) unlinesPS :: [PackedString] -> PackedString unlinesPS ss = concatPS $ intersperse_newlines ss where intersperse_newlines (a:b:s) = a:newline: intersperse_newlines (b:s) intersperse_newlines s = s newline = packString "\n" wordsPS :: PackedString -> [PackedString] wordsPS ps = splitWithPS isSpace ps reversePS :: PackedString -> PackedString reversePS ps = packString (reverse (unpackPS ps)) foreign import ccall unsafe "static string.h memcpy" c_memcpy :: Ptr Word8 -> Ptr Word8 -> Int -> IO () concatPS :: [PackedString] -> PackedString concatPS [] = nilPS concatPS [ps] = ps concatPS xs = unsafePerformIO $ do let start_size = 1024 p <- mallocArray start_size f p 0 1024 xs where f ptr len _ [] = do ptr' <- reallocArray ptr len fp <- newForeignPtr c_free ptr' return $ PS fp 0 len f ptr len to_go pss@(PS p s l:pss') | l <= to_go = do withForeignPtr p $ \pf -> c_memcpy (ptr `advancePtr` len) (pf `advancePtr` s) l f ptr (len + l) (to_go - l) pss' | otherwise = do let new_total = ((len + to_go) * 2) `max` (len + l) ptr' <- reallocArray ptr new_total f ptr' len (new_total - len) pss -- ----------------------------------------------------------------------------- -- concatLenPS -- | Camse as concatPS only you tell it how big the result will be. -- If you lie thenBad Things will happen. concatLenPS :: Int -> [PackedString] -> PackedString concatLenPS n [] = n `seq` nilPS concatLenPS _ [ps] = ps concatLenPS total_length pss = createPS total_length $ \p-> cpPSs p pss where cpPSs :: Ptr Word8 -> [PackedString] -> IO () cpPSs p (PS x s l:rest) = do withForeignPtr x $ \pf -> c_memcpy p (pf `plusPtr` s) l cpPSs (p `plusPtr` l) rest cpPSs _ [] = return () {-# INLINE findPS #-} findPS :: Char -> PackedString -> Maybe Int findPS c ps = wfindPS (c2w c) ps {-# INLINE wfindPS #-} wfindPS :: Word8 -> PackedString -> Maybe Int wfindPS c (PS x s l) = unsafePerformIO $ withForeignPtr x $ \p-> let p' = p `plusPtr` s q = memchr p' (fromIntegral c) (fromIntegral l) in return $ if q == nullPtr then Nothing else Just (q `minusPtr` p') foreign import ccall unsafe "string.h memchr" memchr :: Ptr Word8 -> CInt -> CSize -> Ptr Word8 {-# INLINE findLastPS #-} findLastPS :: Char -> PackedString -> Maybe Int findLastPS c ps = wfindLastPS (c2w c) ps {-# INLINE wfindLastPS #-} wfindLastPS :: Word8 -> PackedString -> Maybe Int wfindLastPS c (PS x s l) = unsafePerformIO $ withForeignPtr x $ \p-> findit (-1) (p `plusPtr` s) 0 where findit h p i = if i >= l then if h < 0 then return Nothing else return $ Just h else do here <- peekElemOff p i if c == here then findit i p (i+1) else findit h p (i+1) ------------------------------------------------------------ {-# INLINE splitPS #-} splitPS :: Char -> PackedString -> [PackedString] splitPS c = wsplitPS (c2w c) {-# INLINE wsplitPS #-} wsplitPS :: Word8 -> PackedString -> [PackedString] wsplitPS c ps = case wfindPS c ps of Nothing -> if nullPS ps then [] else [ps] Just n -> takePS n ps : wsplitPS c (dropPS (n+1) ps) splitWithPS :: (Char -> Bool) -> PackedString -> [PackedString] splitWithPS f ps = case [ m | m <- [0..lengthPS ps-1], f (w2c (ps ! m)) ] of [] -> if nullPS ps then [] else [ps] (n:_) -> takePS n ps : splitWithPS f (dropPS (n+1) ps) -- ----------------------------------------------------------------------------- -- Local utility functions -- The definition of @_substrPS@ is essentially: -- @take (end - begin + 1) (drop begin str)@. substrPS :: PackedString -> Int -> Int -> PackedString substrPS (PS ps s _) begin end = PS ps (s+begin) (1+end-begin) --substrPS (PS ps s l) begin end -- | end <= l && begin <= end && begin >= 0 = PS ps (s+begin) (1+end-begin) -- | otherwise = bug "substrPS out of bounds" -- ----------------------------------------------------------------------------- -- hPutPS -- | Outputs a 'PackedString' to the specified 'Handle'. -- -- NOTE: the representation of the 'PackedString' in the file is assumed to -- be in the ISO-8859-1 encoding. In other words, only the least signficant -- byte is taken from each character in the 'PackedString'. hPutPS :: Handle -> PackedString -> IO () hPutPS _ (PS _ _ 0) = return () hPutPS h (PS ps 0 l) = withForeignPtr ps $ \p-> hPutBuf h p l hPutPS h (PS ps s l) = withForeignPtr ps $ \p-> hPutBuf h (p `plusPtr` s) l -- ----------------------------------------------------------------------------- -- hGetPS -- | Read a 'PackedString' directly from the specified 'Handle'. This -- is far more efficient than reading the characters into a 'String' -- and then using 'packString'. -- -- NOTE: as with 'hPutPS', the string representation in the file is -- assumed to be ISO-8859-1. hGetPS :: Handle -> Int -> IO PackedString hGetPS _ 0 = return nilPS hGetPS h i = do fp <- mallocForeignPtr i debugForeignPtr fp $ "hGetPS "++show h l <- withForeignPtr fp $ \p-> hGetBuf h p i return $ PS fp 0 l -- ----------------------------------------------------------------------------- -- hGetContentsPS -- | Read entire handle contents into a 'PackedString'. -- -- NOTE: as with 'hGetPS', the string representation in the file is -- assumed to be ISO-8859-1. hGetContentsPS :: Handle -> IO PackedString hGetContentsPS h = do let start_size = 1024 p <- mallocArray start_size i <- hGetBuf h p start_size if i < start_size then do p' <- reallocArray p i fp <- newForeignPtr c_free p' return $ PS fp 0 i else f p start_size where f p s = do let s' = 2 * s p' <- reallocArray p s' i <- hGetBuf h (p' `plusPtr` s) s if i < s then do let i' = s + i p'' <- reallocArray p' i' fp <- newForeignPtr c_free p'' return $ PS fp 0 i' else f p' s' -- ----------------------------------------------------------------------------- -- readFilePS -- | Read an entire file directly into a 'PackedString'. This is far more -- efficient than reading the characters into a 'String' and then using -- 'packString'. It also may be more efficient than opening the file and -- reading it using hGetPS. -- -- NOTE: as with 'hGetPS', the string representation in the file is -- assumed to be ISO-8859-1. readFilePS :: FilePath -> IO PackedString readFilePS f = do h <- openBinaryFile f ReadMode l <- hFileSize h s <- hGetPS h $ fromIntegral l hClose h return s -- ----------------------------------------------------------------------------- -- writeFilePS -- | Write a 'PackedString' to a file. writeFilePS :: FilePath -> PackedString -> IO () writeFilePS f ps = do h <- openBinaryFile f WriteMode hPutPS h ps hClose h -- ----------------------------------------------------------------------------- -- gzReadFilePS -- | Read an entire file, which may or may not be gzip compressed, directly -- into a 'PackedString'. foreign import ccall unsafe "static zlib.h gzopen" c_gzopen :: CString -> CString -> IO (Ptr ()) foreign import ccall unsafe "static zlib.h gzclose" c_gzclose :: Ptr () -> IO () foreign import ccall unsafe "static zlib.h gzread" c_gzread :: Ptr () -> Ptr Word8 -> Int -> IO Int gzReadFilePS :: FilePath -> IO PackedString gzReadFilePS f = do h <- openBinaryFile f ReadMode header <- hGetPS h 2 if header /= packString "\31\139" then do hClose h mmapFilePS f else do hSeek h SeekFromEnd (-4) len <- hGetLittleEndInt h hClose h withCString f $ \fstr-> withCString "rb" $ \rb-> do gzf <- c_gzopen fstr rb when (gzf == nullPtr) $ fail $ "problem opening file "++f fp <- mallocForeignPtr len debugForeignPtr fp $ "gzReadFilePS "++f lread <- withForeignPtr fp $ \p -> c_gzread gzf p len c_gzclose gzf when (lread /= len) $ fail $ "problem gzreading file "++f return $ PS fp 0 len hGetLittleEndInt :: Handle -> IO Int hGetLittleEndInt h = do b1 <- ord `liftM` hGetChar h b2 <- ord `liftM` hGetChar h b3 <- ord `liftM` hGetChar h b4 <- ord `liftM` hGetChar h return $ b1 + 256*b2 + 65536*b3 + 16777216*b4 -- ----------------------------------------------------------------------------- -- mmapFilePS -- | Like readFilePS, this reads an entire file directly into a -- 'PackedString', but it is even more efficient. It involves directly -- mapping the file to memory. This has the advantage that the contents of -- the file never need to be copied. Also, under memory pressure the page -- may simply be discarded, wile in the case of readFilePS it would need to -- be written to swap. If you read many small files, mmapFilePS will be -- less memory-efficient than readFilePS, since each mmapFilePS takes up a -- separate page of memory. Also, you can run into bus errors if the file -- is modified. NOTE: as with 'readFilePS', the string representation in -- the file is assumed to be ISO-8859-1. mmapFilePS :: FilePath -> IO PackedString mmapFilePS f = if use_mmap then do (fp,l) <- mmap f return $ PS fp 0 l else readFilePS f #if defined(__GLASGOW_HASKELL__) foreign import ccall unsafe "static fpstring.h my_mmap" my_mmap :: Int -> Int -> IO (Ptr Word8) foreign import ccall unsafe "static sys/mman.h munmap" c_munmap :: Ptr Word8 -> Int -> IO Int foreign import ccall unsafe "static unistd.h close" c_close :: Int -> IO Int #endif mmap :: FilePath -> IO (ForeignPtr Word8, Int) mmap f = do h <- openBinaryFile f ReadMode l <- fromIntegral `liftM` hFileSize h -- Don't bother mmaping small files because each mmapped file takes up -- at least one full VM block. if l < mmap_limit then do thefp <- mallocForeignPtr l debugForeignPtr thefp $ "mmap short file "++f withForeignPtr thefp $ \p-> hGetBuf h p l hClose h return (thefp, l) else do #if defined(__GLASGOW_HASKELL__) fd <- fromIntegral `liftM` handleToFd h p <- my_mmap l fd fp <- if p == nullPtr then #else fp <- #endif do thefp <- mallocForeignPtr l debugForeignPtr thefp $ "mmap short file "++f withForeignPtr thefp $ \p' -> hGetBuf h p' l return thefp #if defined(__GLASGOW_HASKELL__) else do fp <- FC.newForeignPtr p (do {c_munmap p l; return (); }) debugForeignPtr fp $ "mmap "++f return fp c_close fd #endif hClose h return (fp, l) where mmap_limit = 16*1024 -- ------------------------------------------------------------------------- -- readIntPS -- | readIntPS skips any whitespace at the beginning of its argument, and -- reads an Int from the beginning of the PackedString. If there is no -- integer at the beginning of the string, it returns Nothing, otherwise it -- just returns the int read, along with a PackedString containing the -- remainder of its input. The actual parsing is done by the standard C -- library function strtol. foreign import ccall unsafe "static stdlib.h strtol" c_strtol :: Ptr Word8 -> Ptr (Ptr Word8) -> Int -> IO CLong readIntPS :: PackedString -> Maybe (Int, PackedString) readIntPS (PS x s l) = unsafePerformIO $ withForeignPtr x $ \p-> with p $ \endpp -> do val <- c_strtol (p `plusPtr` s) endpp 0 skipped <- (`minusPtr` (p `plusPtr` s)) `liftM` peek endpp if skipped == 0 then return Nothing else return $ Just (fromIntegral val, PS x (s+skipped) (l-skipped)) -- ------------------------------------------------------------------------- -- fromPS2Hex foreign import ccall unsafe "static fpstring.h conv_to_hex" conv_to_hex :: Ptr Word8 -> Ptr Word8 -> Int -> IO () fromPS2Hex :: PackedString -> PackedString fromPS2Hex (PS x s l) = createPS (2*l) $ \p -> withForeignPtr x $ \f -> conv_to_hex p (f `plusPtr` s) l -- ------------------------------------------------------------------------- -- fromHex2PS foreign import ccall unsafe "static fpstring.h conv_from_hex" conv_from_hex :: Ptr Word8 -> Ptr Word8 -> Int -> IO () fromHex2PS :: PackedString -> PackedString fromHex2PS (PS x s l) = createPS (l `div` 2) $ \p -> withForeignPtr x $ \f -> conv_from_hex p (f `plusPtr` s) (l `div` 2) -- ------------------------------------------------------------------------- -- mylexPS -- | mylexPS reads a single token from the PackedString, where a token is -- defined as a sequence of non-whitespace characters which are delimited -- by whitespace. mylexPS :: PackedString -> Maybe (PackedString,PackedString) mylexPS s = if nullPS (dropWhitePS s) then Nothing else Just $ breakWhitePS $ dropWhitePS s -- ------------------------------------------------------------------------- -- betweenLinesPS -- | betweenLinesPS returns the PackedString between the two lines given, -- or Nothing if they do not appear. betweenLinesPS :: PackedString -> PackedString -> PackedString -> Maybe (PackedString) betweenLinesPS start end ps = case break (start ==) (linesPS ps) of (_, _:rest@(PS ps1 s1 _:_)) -> case break (end ==) rest of (_, PS _ s2 _:_) -> Just $ PS ps1 s1 (s2 - s1) _ -> Nothing _ -> Nothing