Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Issue 4] Incremental parsing for CSV with Headers #22

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions Data/Csv/Parser/Megaparsec/Incremental.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
module Data.Csv.Parser.Megaparsec.Incremental (Parser(..), decode) where

import qualified Debug.Trace as DT

import Data.ByteString.Lazy as BL

import Data.Csv hiding (Parser, decode,
record, runParser)
import qualified Data.Csv as C
import qualified Data.Csv.Parser.Megaparsec.Internals as I

import qualified Data.Vector as V

import Text.Megaparsec (ParseErrorBundle, parse)


type ParseError = ParseErrorBundle BL.ByteString I.ConversionError

data Parser a
= Fail !BL.ByteString ParseError
| Many (V.Vector (Either ParseError a)) (BL.ByteString -> Parser a)
| Done (V.Vector (Either ParseError a))

-- | Parses incrementally using 'csvWithHeader' and 'defaultDecodeOptions'.
--
-- Note: Given an empty 'ByteString' will make the parser fail. Therefore, make
-- sure you handle that one level above (before passing that to this parser).
decode
:: C.FromRecord a
=> Parser a
decode = Many V.empty (go V.empty)
where
runParser :: C.FromRecord a => BL.ByteString -> Either ParseError a
runParser =
parse (I.record (C.decDelimiter C.defaultDecodeOptions) C.parseRecord) ""
-- parse (I.record (C.decDelimiter C.defaultDecodeOptions) (_ C.parseRecord)) ""

go :: C.FromRecord a => V.Vector (Either ParseError a) -> ByteString -> Parser a
go acc bs =
if BL.null bs
-- if BL.null (DT.traceShowId bs)

then Done acc
else
case runParser input of
Left e ->
Many (Left e `V.cons` acc) (go V.empty . (rest <>))
Right v ->
Many (Right v `V.cons` acc) (go V.empty . (rest <>))
where
-- TODO: `span` and `break` put '\n' on the left but we need to get rid
-- of it otherwise we end up accumulating the rest.
eol = 10
(input, rest) = DT.traceShowId $ BL.span (/= eol) bs


-- go' bs _ (Left pe) =
-- FailH bs (errorBundlePretty pe)

-- go' _ acc (Right (h, r)) =
-- if V.null r
-- then DoneH h acc
-- else PartialH $ \rbs -> go' rbs (r <> acc) (parser rbs)
2 changes: 2 additions & 0 deletions cassava-megaparsec.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ library
, vector >= 0.11 && < 0.13
exposed-modules: Data.Csv.Parser.Megaparsec
, Data.Csv.Parser.Megaparsec.Internals
, Data.Csv.Parser.Megaparsec.Incremental
if flag(dev)
ghc-options: -Wall -Werror
else
Expand All @@ -56,6 +57,7 @@ test-suite tests
, hspec >= 2.0 && < 3.0
, hspec-megaparsec >= 2.0 && < 3.0
, vector >= 0.11 && < 0.13
other-modules: IncrementalSpec
if flag(dev)
ghc-options: -Wall -Werror
else
Expand Down
106 changes: 106 additions & 0 deletions tests/IncrementalSpec.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
{-# LANGUAGE DeriveGeneric #-}

module IncrementalSpec where

import Data.ByteString.Lazy (ByteString, drop, take)
import Data.ByteString.Lazy.Char8 (pack)
import Data.Csv (FromRecord)
import Data.Csv.Parser.Megaparsec.Incremental (Parser (..), decode)
import qualified Data.Csv.Parser.Megaparsec.Internals as I
import Data.Either (isRight)
import Data.List (isInfixOf, uncons)
import qualified Data.Vector as V
import qualified Debug.Trace as DT
import GHC.Generics
import Prelude hiding (drop, take)
import Test.Hspec
import Text.Megaparsec (ParseErrorBundle)

spec :: Spec
spec =
describe "Decoding without headers" decodingSpec

data Person = Person { name :: String, age :: Int } deriving (Show, Generic)

instance FromRecord Person

dataFailure :: ByteString
dataFailure = pack $ unlines
[ "name,age"
, "Bart,10"
, "MrBurns,Unknown"
, "MoleMan,Unknown"
, "Lisa,8"
]

dataSuccess :: ByteString
dataSuccess = pack $ unlines
[ "name,age"
, "Bart,10"
, "Lisa,8"
, "Maggie,1"
]

decodingSpec :: Spec
decodingSpec = do
describe "given csv with wrong input in the middle" $ do
it "parses all the file and returns errors and " $
case DT.traceShowId $ V.break isRight (parseAccumulator dataFailure) of
(errors, successes) ->
V.length errors == 2
&& V.length successes == 2

-- it "performs three steps (2 partials + 1 fail) before failing" $
-- case runParserWithSteps dataFailure of
-- (Fail _ _, steps) -> steps == 3
-- _ -> False

-- describe "given csv with all right input" $ do
-- it "returns DoneH with the comsumed input" $
-- case runParserWithSteps dataSuccess of
-- (Done h r, _) -> V.length r == 3
-- _ -> False

-- it "performs four steps (3 partial + 1 done) before finishing" $
-- case runParserWithSteps dataSuccess of
-- (Done _ _, steps) -> steps == 4
-- _ -> False


-- | Helpers

-- Parses a single line with a header at a time
-- feed :: Maybe (String, [String]) -> (ByteString -> HeaderParser (V.Vector a)) -> HeaderParser (V.Vector a)
-- feed input f =
-- case input of
-- Just (firstLine, nextLine:_) -> f (pack $ unlines [firstLine, nextLine])
-- Just (_, []) -> DoneH mempty V.empty
-- _ -> DoneH mempty V.empty

-- Parses the given string until the end
-- goParser :: Int -> String -> HeaderParser (V.Vector a) -> (HeaderParser (V.Vector a), Int)
-- goParser !acc input (PartialH f) =
-- goParser (acc + 1) input (feed input' f)
-- where
-- input' =
-- case uncons (lines input) of
-- Just (fl, nls) -> Just (fl, drop acc nls)
-- _ -> Nothing
-- goParser acc _ r = (r, acc)

type ParseError = ParseErrorBundle ByteString I.ConversionError

parseAccumulator :: ByteString -> V.Vector (Either ParseError Person)
parseAccumulator input =
let loop _ acc (Fail bs pe) = error "unexpected error"
loop i acc (Many rs k) = loop (i + 1) (acc <> rs) (feed i k)
loop _ acc (Done rs) = acc <> rs
feed i k =
if i > 10
then k mempty
else k (take size $ drop n input)
where
size = 10
n = i * size
in
loop 0 V.empty decode
21 changes: 12 additions & 9 deletions tests/Spec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@

module Main (main) where

import Data.ByteString (ByteString)
import Data.Csv hiding (decode, decodeWith, decodeByName, decodeByNameWith)
import Data.Csv.Parser.Megaparsec
import Data.Vector (Vector)
import Test.Hspec
import Test.Hspec.Megaparsec
import qualified Data.ByteString.Lazy as BL
import qualified Data.Vector as V
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as BL
import Data.Csv hiding (decode, decodeByName,
decodeByNameWith, decodeWith)
import Data.Csv.Parser.Megaparsec
import Data.Vector (Vector)
import qualified Data.Vector as V
import qualified IncrementalSpec
import Test.Hspec
import Test.Hspec.Megaparsec

#if !MIN_VERSION_base(4,8,0)
import Control.Applicative ((<$>))
import Control.Applicative ((<$>))
#endif

main :: IO ()
Expand All @@ -25,6 +27,7 @@ spec = do
describe "decodeWith" decodeWithSpec
describe "decodeByName" decodeByNameSpec
describe "decodeByNameWith" decodeByNameWithSpec
describe "Incremental" IncrementalSpec.spec

decodeSpec :: Spec
decodeSpec = do
Expand Down