-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add CSV file reader function and test case
- Loading branch information
Showing
7 changed files
with
202 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
use std::fs::File; | ||
use std::io::Seek; | ||
use std::sync::Arc; | ||
|
||
use arrow::array::RecordBatch; | ||
use arrow::csv::reader::Format; | ||
use arrow::csv::{Reader, ReaderBuilder}; | ||
use arrow::datatypes::SchemaRef; | ||
|
||
use crate::datasource::memory::MemoryDataSource; | ||
use crate::datasource::DataSource; | ||
use crate::error::{Error, Result}; | ||
use crate::logical::expr::LogicalExpr; | ||
|
||
use super::DataFilePath; | ||
|
||
/// Options controlling how a CSV file is parsed by `read_csv`.
///
/// The fields are public so callers can override individual settings with struct-update
/// syntax, e.g. `CsvReadOptions { delimiter: b';', ..Default::default() }`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CsvReadOptions {
    /// Whether the first line of the file is a header row.
    pub has_header: bool,
    /// Byte that separates fields within a record.
    pub delimiter: u8,
    /// Quote byte; `None` leaves the reader's own setting untouched.
    pub quote: Option<u8>,
    /// Escape byte; `None` leaves the reader's own setting untouched.
    pub escape: Option<u8>,
}
|
||
impl Default for CsvReadOptions { | ||
fn default() -> Self { | ||
Self { | ||
has_header: true, | ||
delimiter: b',', | ||
quote: None, | ||
escape: None, | ||
} | ||
} | ||
} | ||
|
||
pub fn read_csv<T: DataFilePath>(path: T, options: CsvReadOptions) -> Result<Arc<dyn DataSource>> { | ||
let url = path.to_url()?; | ||
|
||
match url.scheme() { | ||
"file" => { | ||
// FIXME this may not ok when csv file is too big to read into memory | ||
let mut file = | ||
File::open(url.path()).map_err(|e| Error::InternalError(e.to_string()))?; | ||
|
||
let mut format = Format::default() | ||
.with_header(options.has_header) | ||
.with_delimiter(options.delimiter); | ||
|
||
if let Some(quote) = options.quote { | ||
format = format.with_quote(quote); | ||
} | ||
if let Some(escape) = options.escape { | ||
format = format.with_escape(escape); | ||
} | ||
|
||
// max records set 2 means we only read the first 2 records to infer the schema | ||
// first line is header | ||
// second line is data to infer the data type | ||
let (schema, _) = format | ||
.infer_schema(&mut file, None) | ||
.map_err(|e| Error::ArrowError(e))?; | ||
|
||
// rewind the file to the beginning because the schema inference | ||
file.rewind().unwrap(); | ||
|
||
let schema = Arc::new(schema); | ||
|
||
ReaderBuilder::new(schema.clone()) | ||
.with_format(format) | ||
.build(file) | ||
.and_then(|reader| reader.into_iter().collect()) | ||
.map(|data| Arc::new(MemoryDataSource::new(schema, data)) as Arc<dyn DataSource>) | ||
.map_err(|e| Error::ArrowError(e)) | ||
} | ||
_ => unimplemented!(), | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use arrow::util;

    use super::*;

    /// Reads the bundled fixture with the default options and checks that data comes back.
    #[test]
    fn test_read_csv() {
        let options = CsvReadOptions::default();

        let source = read_csv("tests/testdata/file/case1.csv", options).unwrap();
        let batches = source.scan(None, &vec![]).unwrap();

        // A print-only test cannot fail on wrong output; require at least one data row.
        let total_rows: usize = batches.iter().map(|batch| batch.num_rows()).sum();
        assert!(
            total_rows > 0,
            "expected at least one data row in the fixture"
        );

        println!(
            "{}",
            util::pretty::pretty_format_batches(&batches).unwrap()
        );
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
pub mod csv; | ||
|
||
use std::{ | ||
fs::{self, File}, | ||
vec, | ||
}; | ||
|
||
use arrow::{ | ||
csv::{reader::BufReader, Reader}, | ||
datatypes::SchemaRef, | ||
}; | ||
use url::Url; | ||
|
||
use crate::{ | ||
error::{Error, Result}, | ||
logical::expr::LogicalExpr, | ||
}; | ||
|
||
use super::DataSource; | ||
|
||
pub trait DataFilePath { | ||
fn to_url(self) -> Result<Url>; | ||
} | ||
|
||
impl DataFilePath for String { | ||
fn to_url(self) -> Result<Url> { | ||
parse_path(self) | ||
} | ||
} | ||
|
||
impl DataFilePath for &str { | ||
fn to_url(self) -> Result<Url> { | ||
parse_path(self) | ||
} | ||
} | ||
|
||
fn parse_path<S: AsRef<str>>(path: S) -> Result<Url> { | ||
match path.as_ref().parse::<Url>() { | ||
Ok(url) => Ok(url), | ||
Err(url::ParseError::RelativeUrlWithoutBase) => fs::canonicalize(path.as_ref()) | ||
.and_then(|absolute| Ok(Url::from_file_path(absolute).unwrap())) | ||
.map_err(|e| { | ||
Error::InternalError(format!( | ||
"file path: {}, err: {}", | ||
path.as_ref(), | ||
e.to_string() | ||
)) | ||
}), | ||
Err(e) => Err(Error::InternalError(e.to_string())), | ||
} | ||
} | ||
|
||
/// FileSource is a data source for reading data from a file | ||
/// different file formats have different Readers | ||
/// all readers from arrows | ||
#[derive(Debug)] | ||
pub struct FileSource<R> { | ||
schema: SchemaRef, | ||
reader: R, | ||
} | ||
|
||
impl FileSource<BufReader<File>> { | ||
pub fn new(schema: SchemaRef, reader: BufReader<File>) -> Self { | ||
Self { schema, reader } | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -80,4 +80,6 @@ mod tests { | |
|
||
Ok(()) | ||
} | ||
|
||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
id,name,location | ||
1,BeiJing University,China BeiJing |