Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to read CSV without header row #82

Merged
merged 9 commits into from
Dec 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions pydax/loaders/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic
- ``columns`` key specifies the data type of each column. Each data type corresponds to a dtype
supported by Pandas. If unspecified, the default dtype is used.
- ``delimiter`` key specifies the delimiter of the input CSV file.
- ``no_header`` key specifies whether the CSV file lacks a header row. Defaults to False, in which case the
header is inferred from the file. If the value is set to anything "truthy" in Python, the first row of the CSV will be read as data and the column names are taken from the ``columns`` key.
- ``encoding`` key specifies the encoding of the CSV file. Defaults to UTF-8.
:raises TypeError: ``path`` is not a path object.
"""
Expand All @@ -55,9 +57,18 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic
else:
dtypes[column] = type_

names = None
header = None
if options.get('no_header'):
# If no header use the columns provided in schema
names = [*options.get('columns', {})]
else:
header = 'infer'

return pd.read_csv(path, dtype=dtypes,
# The following line after "if" is for circumventing
# https://github.com/pandas-dev/pandas/issues/38489
parse_dates=parse_dates if len(parse_dates) > 0 else False,
header=header, names=names,
encoding=options.get('encoding', 'utf-8'),
delimiter=options.get('delimiter', ','))
16 changes: 16 additions & 0 deletions tests/test_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,3 +243,19 @@ def test_csv_pandas_loader_no_encoding(self, tmp_path, noaa_jfk_schema):

del noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['encoding']
self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema)

def test_csv_pandas_header(self, tmp_path, noaa_jfk_schema):
    """Test CSVPandasLoader header options.

    Covers three situations for the ``no_header`` option of the
    ``jfk_weather_cleaned`` subdataset: a truthy value, several falsy values,
    and the option being absent altogether.
    """

    loader_options = noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']

    # With the header row read as data, Pandas should error from trying to
    # read a string as another dtype.
    loader_options['no_header'] = True
    with pytest.raises(ValueError) as excinfo:
        Dataset(noaa_jfk_schema, tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)
    assert 'could not convert string to float' in str(excinfo.value)

    # These should all be treated as False, so the regular loader test is
    # expected to pass for each of them.
    for falsy_value in (False, '', None):
        loader_options['no_header'] = falsy_value
        self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema)

    # Finally, drop the option entirely and run the regular loader test again.
    del loader_options['no_header']
    self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema)