From d6ba9cccddf3b6c7a09ce0095ddac5cf84d9fe79 Mon Sep 17 00:00:00 2001 From: Daniel Ji Date: Tue, 13 Aug 2024 18:16:07 -0700 Subject: [PATCH 1/7] add read_string --- src/mdocfile/__init__.py | 2 +- src/mdocfile/data_models.py | 12 +++++++++++- src/mdocfile/functions.py | 20 ++++++++++++++++++++ tests/conftest.py | 7 ++++++- tests/test_functions.py | 7 ++++++- 5 files changed, 44 insertions(+), 4 deletions(-) diff --git a/src/mdocfile/__init__.py b/src/mdocfile/__init__.py index 8acbf99..83d903f 100644 --- a/src/mdocfile/__init__.py +++ b/src/mdocfile/__init__.py @@ -1 +1 @@ -from .functions import read +from .functions import read, read_string diff --git a/src/mdocfile/data_models.py b/src/mdocfile/data_models.py index 0bd5b84..05acb07 100644 --- a/src/mdocfile/data_models.py +++ b/src/mdocfile/data_models.py @@ -170,7 +170,17 @@ class Mdoc(BaseModel): @classmethod def from_file(cls, filename: str): with open(filename) as file: - lines = [line.strip() for line in file.readlines()] + return cls.from_lines(file.readlines()) + + @classmethod + def from_string(cls, string: str): + lines = string.split('\n') + + return cls.from_lines(lines) + + @classmethod + def from_lines(cls, file_lines: List[str]): + lines = [line.strip() for line in file_lines] split_idxs = find_section_entries(lines) split_idxs.append(len(lines)) diff --git a/src/mdocfile/functions.py b/src/mdocfile/functions.py index 78036c7..f12ebbe 100644 --- a/src/mdocfile/functions.py +++ b/src/mdocfile/functions.py @@ -19,6 +19,26 @@ def read(filename: PathLike) -> pd.DataFrame: dataframe containing info from mdoc file """ mdoc = Mdoc.from_file(filename) + return _read_helper(mdoc) + +def read_string(string: str) -> pd.DataFrame: + """Read an mdoc string as a pandas dataframe. + + Parameters + ---------- + string : str + SerialEM mdoc string data to read + + Returns + ------- + df : pd.DataFrame + dataframe containing info from mdoc file + """ + mdoc = Mdoc.from_string(string) + return _read_helper(mdoc) + +def _read_helper(mdoc: Mdoc) -> pd.DataFrame: + """Helper function to read an mdoc file as a pandas dataframe.""" global_data = mdoc.global_data.model_dump() section_data = { k: [section.model_dump()[k] for section in mdoc.section_data] diff --git a/tests/conftest.py b/tests/conftest.py index da19cee..4308e0f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,16 @@ -import pytest from pathlib import Path +import pytest + @pytest.fixture def tilt_series_mdoc_file(): return Path(__file__).parent / 'test_data' / 'tilt_series.mdoc' +@pytest.fixture +def tilt_series_mdoc_string(): + with open(Path(__file__).parent / 'test_data' / 'tilt_series.mdoc') as f: + return f.read() @pytest.fixture def montage_section_mdoc_file(): diff --git a/tests/test_functions.py b/tests/test_functions.py index ec2b4f0..f664e96 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -1,6 +1,6 @@ import pandas as pd -from mdocfile import read +from mdocfile import read, read_string def test_read_tilt_series_mdoc(tilt_series_mdoc_file): @@ -9,6 +9,11 @@ def test_read_tilt_series_mdoc(tilt_series_mdoc_file): assert df.shape == (41, 26) assert 'TiltAngle' in df.columns +def test_read_tilt_series_mdoc_string(tilt_series_mdoc_string): + df = read_string(tilt_series_mdoc_string) + assert isinstance(df, pd.DataFrame) + assert df.shape == (41, 26) + assert 'TiltAngle' in df.columns def test_read_montage_section_mdoc(montage_section_mdoc_file): df = read(montage_section_mdoc_file) From 18cdc4c5015bd187a730a6bec47506fefab8c37a Mon Sep 17 00:00:00 2001 From: Daniel Ji Date: Tue, 13 Aug 2024 18:21:52 -0700 Subject: [PATCH 2/7] update docs with read_string function --- README.md | 9 +++++++++ docs/index.md | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/README.md b/README.md index ec2c00b..9a64450 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,15 @@ import mdocfile df = mdocfile.read('my_mdoc_file.mdoc') ``` +`mdocfile.read_string()` will read mdoc file data stored in a string + +```python + +import mdocfile + +df = mdocfile.read_string('...mdoc string data...') +``` + For writing valid mdoc files, please see [writing mdoc files](https://teamtomo.org/mdocfile/writing/). diff --git a/docs/index.md b/docs/index.md index 4f872f1..46c80f2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -29,6 +29,15 @@ import mdocfile df = mdocfile.read('my_mdoc_file.mdoc') ``` +`mdocfile.read_string()` will read mdoc file data stored in a string + +```python + +import mdocfile + +df = mdocfile.read_string('...mdoc string data...') +``` + --- For writing valid mdoc files, please see [writing mdoc files](./writing.md). From a9ddf10eebfc831d66aa7aead831cbf5c08dcb08 Mon Sep 17 00:00:00 2001 From: Daniel Ji Date: Tue, 13 Aug 2024 18:27:08 -0700 Subject: [PATCH 3/7] small doc fixes --- README.md | 4 +--- docs/index.md | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 9a64450..6778f42 100644 --- a/README.md +++ b/README.md @@ -28,12 +28,10 @@ import mdocfile df = mdocfile.read('my_mdoc_file.mdoc') ``` -`mdocfile.read_string()` will read mdoc file data stored in a string +`mdocfile.read_string()` will read mdoc file data stored in a string. ```python -import mdocfile - df = mdocfile.read_string('...mdoc string data...') ``` diff --git a/docs/index.md b/docs/index.md index 46c80f2..3aea70a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -29,12 +29,10 @@ import mdocfile df = mdocfile.read('my_mdoc_file.mdoc') ``` -`mdocfile.read_string()` will read mdoc file data stored in a string +`mdocfile.read_string()` will read mdoc file data stored in a string. ```python -import mdocfile - df = mdocfile.read_string('...mdoc string data...') ``` From 30dca94934534f8acc6791e36c0acf22e19f7ad1 Mon Sep 17 00:00:00 2001 From: Daniel Ji Date: Tue, 13 Aug 2024 23:58:05 -0700 Subject: [PATCH 4/7] refactor read_string to be Mdoc method --- README.md | 7 ------- docs/index.md | 7 ------- src/mdocfile/__init__.py | 1 - src/mdocfile/data_models.py | 23 ++++++++++++++++++++++- src/mdocfile/functions.py | 37 +------------------------------------ tests/test_functions.py | 5 +++-- 6 files changed, 26 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 6778f42..ec2c00b 100644 --- a/README.md +++ b/README.md @@ -28,13 +28,6 @@ import mdocfile df = mdocfile.read('my_mdoc_file.mdoc') ``` -`mdocfile.read_string()` will read mdoc file data stored in a string. - -```python - -df = mdocfile.read_string('...mdoc string data...') -``` - For writing valid mdoc files, please see [writing mdoc files](https://teamtomo.org/mdocfile/writing/). diff --git a/docs/index.md b/docs/index.md index 3aea70a..4f872f1 100644 --- a/docs/index.md +++ b/docs/index.md @@ -29,13 +29,6 @@ import mdocfile df = mdocfile.read('my_mdoc_file.mdoc') ``` -`mdocfile.read_string()` will read mdoc file data stored in a string. - -```python - -df = mdocfile.read_string('...mdoc string data...') -``` - --- For writing valid mdoc files, please see [writing mdoc files](./writing.md). diff --git a/src/mdocfile/__init__.py b/src/mdocfile/__init__.py index 83d903f..e69de29 100644 --- a/src/mdocfile/__init__.py +++ b/src/mdocfile/__init__.py @@ -1 +0,0 @@ -from .functions import read, read_string diff --git a/src/mdocfile/data_models.py b/src/mdocfile/data_models.py index 05acb07..2e30eac 100644 --- a/src/mdocfile/data_models.py +++ b/src/mdocfile/data_models.py @@ -1,3 +1,4 @@ +import pandas as pd from pydantic import field_validator, BaseModel from pathlib import Path, PureWindowsPath from typing import List, Optional, Tuple, Union, Sequence @@ -179,7 +180,7 @@ def from_string(cls, string: str): return cls.from_lines(lines) @classmethod - def from_lines(cls, file_lines: List[str]): + def from_lines(cls, file_lines: List[str]) -> 'Mdoc': lines = [line.strip() for line in file_lines] split_idxs = find_section_entries(lines) split_idxs.append(len(lines)) @@ -195,6 +196,26 @@ def from_lines(cls, file_lines: List[str]): in zip(split_idxs, split_idxs[1:]) ] return cls(titles=titles, global_data=global_data, section_data=section_data) + + def as_dataframe(self) -> pd.DataFrame: + """ + Convert an Mdoc object to a pandas DataFrame + """ + global_data = self.global_data.model_dump() + section_data = { + k: [section.model_dump()[k] for section in self.section_data] + for k + in self.section_data[0].model_dump().keys() + } + df = pd.DataFrame(data=section_data) + + # add duplicate copies of global data and mdoc file titles to each row of + # the dataframe - tidy data is easier to analyse + for k, v in global_data.items(): + df[k] = [v] * len(df) + df['titles'] = [self.titles] * len(df) + df = df.dropna(axis='columns', how='all') + return df def to_string(self): """ diff --git a/src/mdocfile/functions.py b/src/mdocfile/functions.py index f12ebbe..3ec1e3e 100644 --- a/src/mdocfile/functions.py +++ b/src/mdocfile/functions.py @@ -18,39 +18,4 @@ def read(filename: PathLike) -> pd.DataFrame: df : pd.DataFrame dataframe containing info from mdoc file """ - mdoc = Mdoc.from_file(filename) - return _read_helper(mdoc) - -def read_string(string: str) -> pd.DataFrame: - """Read an mdoc string as a pandas dataframe. - - Parameters - ---------- - string : str - SerialEM mdoc string data to read - - Returns - ------- - df : pd.DataFrame - dataframe containing info from mdoc file - """ - mdoc = Mdoc.from_string(string) - return _read_helper(mdoc) - -def _read_helper(mdoc: Mdoc) -> pd.DataFrame: - """Helper function to read an mdoc file as a pandas dataframe.""" - global_data = mdoc.global_data.model_dump() - section_data = { - k: [section.model_dump()[k] for section in mdoc.section_data] - for k - in mdoc.section_data[0].model_dump().keys() - } - df = pd.DataFrame(data=section_data) - - # add duplicate copies of global data and mdoc file titles to each row of - # the dataframe - tidy data is easier to analyse - for k, v in global_data.items(): - df[k] = [v] * len(df) - df['titles'] = [mdoc.titles] * len(df) - df = df.dropna(axis='columns', how='all') - return df + return Mdoc.from_file(filename).as_dataframe() diff --git a/tests/test_functions.py b/tests/test_functions.py index f664e96..ced4c2c 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -1,6 +1,7 @@ import pandas as pd -from mdocfile import read, read_string +from mdocfile import read +from mdocfile.data_models import Mdoc def test_read_tilt_series_mdoc(tilt_series_mdoc_file): @@ -10,7 +11,7 @@ def test_read_tilt_series_mdoc(tilt_series_mdoc_file): assert 'TiltAngle' in df.columns def test_read_tilt_series_mdoc_string(tilt_series_mdoc_string): - df = read_string(tilt_series_mdoc_string) + df = Mdoc.from_string(tilt_series_mdoc_string).as_dataframe() assert isinstance(df, pd.DataFrame) assert df.shape == (41, 26) assert 'TiltAngle' in df.columns From 219b552ec23002dc0f343182fd4a359e494f60ad Mon Sep 17 00:00:00 2001 From: Daniel Ji Date: Wed, 14 Aug 2024 00:01:21 -0700 Subject: [PATCH 5/7] fix __init__.py --- src/mdocfile/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mdocfile/__init__.py b/src/mdocfile/__init__.py index e69de29..1cb2442 100644 --- a/src/mdocfile/__init__.py +++ b/src/mdocfile/__init__.py @@ -0,0 +1 @@ +from .functions import read \ No newline at end of file From 935c450de421664b67e87c4064864bd5c78ebe08 Mon Sep 17 00:00:00 2001 From: Daniel Ji Date: Wed, 14 Aug 2024 00:28:34 -0700 Subject: [PATCH 6/7] update docs with Mdoc.from_string().as_dataframe() --- README.md | 11 +++++++++++ docs/index.md | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/README.md b/README.md index ec2c00b..2ae2f77 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,17 @@ import mdocfile df = mdocfile.read('my_mdoc_file.mdoc') ``` +`Mdoc.from_string().as_dataframe()` will return the contents of string mdoc data as a pandas dataframe. +This is useful for mdoc data that is not stored in a file (e.g. from a database or a web request). + +```python +from mdocfile.data_models import Mdoc + +mdoc_data = ... + +mdoc = Mdoc.from_string(mdoc_data).as_dataframe() +``` + For writing valid mdoc files, please see [writing mdoc files](https://teamtomo.org/mdocfile/writing/). diff --git a/docs/index.md b/docs/index.md index 4f872f1..140ec9a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -29,6 +29,17 @@ import mdocfile df = mdocfile.read('my_mdoc_file.mdoc') ``` +`Mdoc.from_string().as_dataframe()` will return the contents of string mdoc data as a pandas dataframe. +This is useful for mdoc data that is not stored in a file (e.g. from a database or a web request). + +```python +from mdocfile.data_models import Mdoc + +mdoc_data = ... + +mdoc = Mdoc.from_string(mdoc_data).as_dataframe() +``` + --- For writing valid mdoc files, please see [writing mdoc files](./writing.md). From aa11c130925ed45369107b0a707c5e0aa0fe7e19 Mon Sep 17 00:00:00 2001 From: Daniel Ji Date: Wed, 14 Aug 2024 00:51:49 -0700 Subject: [PATCH 7/7] update docs --- README.md | 26 +++++++++++++------------- docs/index.md | 28 ++++++++++++++++------------ 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 2ae2f77..58996fd 100644 --- a/README.md +++ b/README.md @@ -28,22 +28,9 @@ import mdocfile df = mdocfile.read('my_mdoc_file.mdoc') ``` -`Mdoc.from_string().as_dataframe()` will return the contents of string mdoc data as a pandas dataframe. -This is useful for mdoc data that is not stored in a file (e.g. from a database or a web request). - -```python -from mdocfile.data_models import Mdoc - -mdoc_data = ... - -mdoc = Mdoc.from_string(mdoc_data).as_dataframe() -``` - For writing valid mdoc files, please see [writing mdoc files](https://teamtomo.org/mdocfile/writing/). - - # Installation pip: @@ -51,3 +38,16 @@ pip: ```shell pip install mdocfile ``` + +# Parsing from text + +`Mdoc.from_string().as_dataframe()` will return the contents of string mdoc data as a pandas dataframe. +This is useful for mdoc data that is not stored in a file (e.g. from a database or a web request). + +```python +from mdocfile.data_models import Mdoc + +mdoc_data = ... + +mdoc = Mdoc.from_string(mdoc_data).as_dataframe() +``` diff --git a/docs/index.md b/docs/index.md index 140ec9a..fefda90 100644 --- a/docs/index.md +++ b/docs/index.md @@ -29,25 +29,29 @@ import mdocfile df = mdocfile.read('my_mdoc_file.mdoc') ``` -`Mdoc.from_string().as_dataframe()` will return the contents of string mdoc data as a pandas dataframe. -This is useful for mdoc data that is not stored in a file (e.g. from a database or a web request). +For writing valid mdoc files, please see [writing mdoc files](./writing.md). -```python -from mdocfile.data_models import Mdoc +--- -mdoc_data = ... +# Installation -mdoc = Mdoc.from_string(mdoc_data).as_dataframe() +pip: + +```shell +pip install mdocfile ``` --- -For writing valid mdoc files, please see [writing mdoc files](./writing.md). +# Parsing from text -# Installation +`Mdoc.from_string().as_dataframe()` will return the contents of string mdoc data as a pandas dataframe. +This is useful for mdoc data that is not stored in a file (e.g. from a database or a web request). -pip: +```python +from mdocfile.data_models import Mdoc -```shell -pip install mdocfile -``` \ No newline at end of file +mdoc_data = ... + +mdoc = Mdoc.from_string(mdoc_data).as_dataframe() +```