From 5cd8f4d5bfe412edebe9a9d112efbeadf22f3f58 Mon Sep 17 00:00:00 2001 From: Daniel Jalova Date: Mon, 14 Dec 2020 21:52:39 -0800 Subject: [PATCH 1/7] Add ability to read CSV without headers --- pydax/loaders/_table.py | 10 ++++++++++ tests/test_loaders.py | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/pydax/loaders/_table.py b/pydax/loaders/_table.py index b772336b..1c751f81 100644 --- a/pydax/loaders/_table.py +++ b/pydax/loaders/_table.py @@ -37,6 +37,7 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic - ``columns`` key specifies the data type of each column. Each data type corresponds to a Pandas' supported dtype. If unspecified, then it is default. - ``delimiter`` key specifies the delimiter of the input CSV file. + - ``header`` key specifies if the first row of the CSV file contains the headers. Defaults to True - ``encoding`` key specifies the encoding of the CSV file. Defaults to UTF-8. :raises TypeError: ``path`` is not a path object. 
""" @@ -55,9 +56,18 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic else: dtypes[column] = type_ + header = 'infer' + names = None + if options.get('header', True) is False: + header = None + # If no header use the columns provided in schema + names = [*options.get('columns', {})] + print(names) + return pd.read_csv(path, dtype=dtypes, # The following line after "if" is for circumventing # https://github.com/pandas-dev/pandas/issues/38489 parse_dates=parse_dates if len(parse_dates) > 0 else False, + header=header, names=names, encoding=options.get('encoding', 'utf-8'), delimiter=options.get('delimiter', ',')) diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 2f587abd..44dd133c 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -243,3 +243,13 @@ def test_csv_pandas_loader_no_encoding(self, tmp_path, noaa_jfk_schema): del noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['encoding'] self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema) + + def test_csv_pandas_header(self, tmp_path, noaa_jfk_schema): + "Test CSVPandasLoader header options" + + noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['header'] = True + self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema) + + with pytest.raises(ValueError): # Pandas should error from trying to read string as another dtype + noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['header'] = False + dataset = Dataset(noaa_jfk_schema, tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD) From 7c6b5eb54125d4b2ceaa6c44f5503bcb7cc7bf12 Mon Sep 17 00:00:00 2001 From: Daniel Jalova Date: Tue, 15 Dec 2020 10:33:08 -0800 Subject: [PATCH 2/7] Remove print --- pydax/loaders/_table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pydax/loaders/_table.py b/pydax/loaders/_table.py index 1c751f81..b48822b9 100644 --- a/pydax/loaders/_table.py +++ b/pydax/loaders/_table.py @@ -62,7 +62,6 @@ def 
load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic header = None # If no header use the columns provided in schema names = [*options.get('columns', {})] - print(names) return pd.read_csv(path, dtype=dtypes, # The following line after "if" is for circumventing From a46e8a2a0b1b9de24001997873b6c8a426a3c784 Mon Sep 17 00:00:00 2001 From: Daniel Jalova Date: Tue, 15 Dec 2020 11:15:54 -0800 Subject: [PATCH 3/7] Fix lint --- pydax/loaders/_table.py | 4 +--- tests/test_loaders.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pydax/loaders/_table.py b/pydax/loaders/_table.py index b48822b9..3c886e36 100644 --- a/pydax/loaders/_table.py +++ b/pydax/loaders/_table.py @@ -56,10 +56,8 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic else: dtypes[column] = type_ - header = 'infer' names = None if options.get('header', True) is False: - header = None # If no header use the columns provided in schema names = [*options.get('columns', {})] @@ -67,6 +65,6 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic # The following line after "if" is for circumventing # https://github.com/pandas-dev/pandas/issues/38489 parse_dates=parse_dates if len(parse_dates) > 0 else False, - header=header, names=names, + names=names, encoding=options.get('encoding', 'utf-8'), delimiter=options.get('delimiter', ',')) diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 44dd133c..4ea5eb3c 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -252,4 +252,4 @@ def test_csv_pandas_header(self, tmp_path, noaa_jfk_schema): with pytest.raises(ValueError): # Pandas should error from trying to read string as another dtype noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['header'] = False - dataset = Dataset(noaa_jfk_schema, tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD) + Dataset(noaa_jfk_schema, tmp_path, 
mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD) From e837843a9b0ab218e0e393ab1d4ed24be29bc8c4 Mon Sep 17 00:00:00 2001 From: Daniel Jalova Date: Tue, 15 Dec 2020 14:17:08 -0800 Subject: [PATCH 4/7] Address review comments --- pydax/loaders/_table.py | 8 ++++++-- tests/test_loaders.py | 5 +++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pydax/loaders/_table.py b/pydax/loaders/_table.py index 3c886e36..e4c68b0f 100644 --- a/pydax/loaders/_table.py +++ b/pydax/loaders/_table.py @@ -38,6 +38,7 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic supported dtype. If unspecified, then it is default. - ``delimiter`` key specifies the delimiter of the input CSV file. - ``header`` key specifies if the first row of the CSV file contains the headers. Defaults to True + If the value set to anything other than False, it will be treated as True. - ``encoding`` key specifies the encoding of the CSV file. Defaults to UTF-8. :raises TypeError: ``path`` is not a path object. 
""" @@ -57,7 +58,10 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic dtypes[column] = type_ names = None - if options.get('header', True) is False: + header = None + if options.get('header', True): + header = 'infer' + else: # If no header use the columns provided in schema names = [*options.get('columns', {})] @@ -65,6 +69,6 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic # The following line after "if" is for circumventing # https://github.com/pandas-dev/pandas/issues/38489 parse_dates=parse_dates if len(parse_dates) > 0 else False, - names=names, + header=header, names=names, encoding=options.get('encoding', 'utf-8'), delimiter=options.get('delimiter', ',')) diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 4ea5eb3c..20ab2bd9 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -250,6 +250,7 @@ def test_csv_pandas_header(self, tmp_path, noaa_jfk_schema): noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['header'] = True self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema) - with pytest.raises(ValueError): # Pandas should error from trying to read string as another dtype - noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['header'] = False + noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['header'] = False + with pytest.raises(ValueError) as exinfo: # Pandas should error from trying to read string as another dtype Dataset(noaa_jfk_schema, tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD) + assert('could not convert string to float' in exinfo.value) From a0bc696f75a634bbc4cc6c9048ec78c13a174cab Mon Sep 17 00:00:00 2001 From: Daniel Jalova Date: Tue, 15 Dec 2020 14:58:29 -0800 Subject: [PATCH 5/7] Address PR comments --- pydax/loaders/_table.py | 2 +- tests/test_loaders.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pydax/loaders/_table.py 
b/pydax/loaders/_table.py index e4c68b0f..4bfbae09 100644 --- a/pydax/loaders/_table.py +++ b/pydax/loaders/_table.py @@ -37,7 +37,7 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic - ``columns`` key specifies the data type of each column. Each data type corresponds to a Pandas' supported dtype. If unspecified, then it is default. - ``delimiter`` key specifies the delimiter of the input CSV file. - - ``header`` key specifies if the first row of the CSV file contains the headers. Defaults to True + - ``header`` key specifies if the first row of the CSV file contains the headers. Defaults to True. If the value set to anything other than False, it will be treated as True. - ``encoding`` key specifies the encoding of the CSV file. Defaults to UTF-8. :raises TypeError: ``path`` is not a path object. diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 20ab2bd9..e004674a 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -253,4 +253,4 @@ def test_csv_pandas_header(self, tmp_path, noaa_jfk_schema): noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['header'] = False with pytest.raises(ValueError) as exinfo: # Pandas should error from trying to read string as another dtype Dataset(noaa_jfk_schema, tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD) - assert('could not convert string to float' in exinfo.value) + assert('could not convert string to float' in str(exinfo.value)) From df36872d2aca7205cbd2df5984aa84d7209a5e5c Mon Sep 17 00:00:00 2001 From: Daniel Jalova Date: Tue, 15 Dec 2020 15:35:06 -0800 Subject: [PATCH 6/7] Change key to no_header --- pydax/loaders/_table.py | 10 +++++----- tests/test_loaders.py | 17 +++++++++++++---- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/pydax/loaders/_table.py b/pydax/loaders/_table.py index 4bfbae09..0c8121a9 100644 --- a/pydax/loaders/_table.py +++ b/pydax/loaders/_table.py @@ -37,8 +37,8 @@ def load(self, path: 
Union[_typing.PathLike, Dict[str, str]], options: SchemaDic - ``columns`` key specifies the data type of each column. Each data type corresponds to a Pandas' supported dtype. If unspecified, then it is default. - ``delimiter`` key specifies the delimiter of the input CSV file. - - ``header`` key specifies if the first row of the CSV file contains the headers. Defaults to True. - If the value set to anything other than False, it will be treated as True. + - ``no_header`` key specifies if the CSV file lacks a header row. Defaults to False. + If the value is set to anything "truthy" in Python, the first row of the CSV will be read as data. - ``encoding`` key specifies the encoding of the CSV file. Defaults to UTF-8. :raises TypeError: ``path`` is not a path object. """ @@ -59,11 +59,11 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic names = None header = None - if options.get('header', True): - header = 'infer' - else: + if options.get('no_header'): # If no header use the columns provided in schema names = [*options.get('columns', {})] + else: + header = 'infer' return pd.read_csv(path, dtype=dtypes, # The following line after "if" is for circumventing diff --git a/tests/test_loaders.py b/tests/test_loaders.py index e004674a..59e9f1f0 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -247,10 +247,19 @@ def test_csv_pandas_loader_no_encoding(self, tmp_path, noaa_jfk_schema): def test_csv_pandas_header(self, tmp_path, noaa_jfk_schema): "Test CSVPandasLoader header options" - noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['header'] = True - self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema) - - noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['header'] = False + noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['no_header'] = True with pytest.raises(ValueError) as exinfo: # Pandas should error from trying to read string 
as another dtype Dataset(noaa_jfk_schema, tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD) assert('could not convert string to float' in str(exinfo.value)) + + noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['no_header'] = False + self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema) + + noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['no_header'] = '' + self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema) + + noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['no_header'] = None + self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema) + + del noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['no_header'] + self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema) From 671557081460872206e299211937a8d4bcd6f910 Mon Sep 17 00:00:00 2001 From: Daniel Jalova Date: Tue, 15 Dec 2020 17:10:21 -0800 Subject: [PATCH 7/7] Clean up test cases --- tests/test_loaders.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 59e9f1f0..47d1b232 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -252,14 +252,10 @@ def test_csv_pandas_header(self, tmp_path, noaa_jfk_schema): Dataset(noaa_jfk_schema, tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD) assert('could not convert string to float' in str(exinfo.value)) - noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['no_header'] = False - self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema) - - noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['no_header'] = '' - self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema) - - noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['no_header'] = None - self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema) + false_test_cases = [False, '', None] # These should all be treated as False + for case in 
false_test_cases: + noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['no_header'] = case + self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema) del noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['no_header'] self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema)