Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Read mdocfile data from a string #28

Merged
merged 7 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,23 @@ df = mdocfile.read('my_mdoc_file.mdoc')
For writing valid mdoc files, please see
[writing mdoc files](https://teamtomo.org/mdocfile/writing/).



# Installation

pip:

```shell
pip install mdocfile
```

# Parsing from text

`Mdoc.from_string(text).as_dataframe()` parses mdoc data held in a string and returns its contents as a pandas DataFrame.
This is useful when the mdoc data is not stored in a file (e.g. when it comes from a database or a web request).

```python
from mdocfile.data_models import Mdoc

mdoc_data = ...

df = Mdoc.from_string(mdoc_data).as_dataframe()
```
21 changes: 18 additions & 3 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,29 @@ import mdocfile
df = mdocfile.read('my_mdoc_file.mdoc')
```

---

For writing valid mdoc files, please see [writing mdoc files](./writing.md).

---

# Installation

pip:

```shell
pip install mdocfile
```

---

# Parsing from text

`Mdoc.from_string(text).as_dataframe()` parses mdoc data held in a string and returns its contents as a pandas DataFrame.
This is useful when the mdoc data is not stored in a file (e.g. when it comes from a database or a web request).

```python
from mdocfile.data_models import Mdoc

mdoc_data = ...

df = Mdoc.from_string(mdoc_data).as_dataframe()
```
2 changes: 1 addition & 1 deletion src/mdocfile/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .functions import read
from .functions import read
33 changes: 32 additions & 1 deletion src/mdocfile/data_models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pandas as pd
from pydantic import field_validator, BaseModel
from pathlib import Path, PureWindowsPath
from typing import List, Optional, Tuple, Union, Sequence
Expand Down Expand Up @@ -170,7 +171,17 @@ class Mdoc(BaseModel):
@classmethod
def from_file(cls, filename: str) -> 'Mdoc':
    """Read an mdoc file from disk and parse it.

    Parameters
    ----------
    filename : str
        Path of the mdoc file to open.

    Returns
    -------
    Mdoc
        The object built by ``cls.from_lines`` from the lines of the file.
    """
    with open(filename) as file:
        # Read the handle exactly once: a second readlines() on the same
        # handle returns an empty list. Whitespace stripping is left to
        # from_lines so file and string input share one code path.
        return cls.from_lines(file.readlines())

@classmethod
def from_string(cls, string: str) -> 'Mdoc':
    """Parse mdoc data held in a string.

    Parameters
    ----------
    string : str
        Full text of an mdoc file.

    Returns
    -------
    Mdoc
        The object built by ``cls.from_lines`` from the lines of the text.
    """
    # splitlines() handles '\n', '\r\n' and '\r' line endings and does not
    # produce a phantom empty last line when the text ends with a newline,
    # so from_lines sees the same logical lines as it does via from_file.
    return cls.from_lines(string.splitlines())

@classmethod
def from_lines(cls, file_lines: List[str]) -> 'Mdoc':
lines = [line.strip() for line in file_lines]
split_idxs = find_section_entries(lines)
split_idxs.append(len(lines))

Expand All @@ -185,6 +196,26 @@ def from_file(cls, filename: str):
in zip(split_idxs, split_idxs[1:])
]
return cls(titles=titles, global_data=global_data, section_data=section_data)

def as_dataframe(self) -> pd.DataFrame:
    """Convert this Mdoc object to a pandas DataFrame.

    Each entry of ``self.section_data`` becomes one row. The global data
    and the titles are copied onto every row, and columns that contain no
    values at all are dropped.

    Returns
    -------
    df : pd.DataFrame
        One row per section. An empty dataframe is returned when there are
        no sections, because there are no rows to attach the global data to.
    """
    # Dump every section exactly once instead of once per key.
    section_dumps = [section.model_dump() for section in self.section_data]
    if not section_dumps:
        return pd.DataFrame()

    # Column names are taken from the first section's dump.
    section_columns = {
        key: [dump[key] for dump in section_dumps]
        for key in section_dumps[0]
    }
    df = pd.DataFrame(data=section_columns)

    # add duplicate copies of global data and mdoc file titles to each row of
    # the dataframe - tidy data is easier to analyse
    n_rows = len(df)
    for key, value in self.global_data.model_dump().items():
        df[key] = [value] * n_rows
    df['titles'] = [self.titles] * n_rows
    return df.dropna(axis='columns', how='all')

def to_string(self):
"""
Expand Down
17 changes: 1 addition & 16 deletions src/mdocfile/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,4 @@ def read(filename: PathLike) -> pd.DataFrame:
df : pd.DataFrame
dataframe containing info from mdoc file
"""
mdoc = Mdoc.from_file(filename)
global_data = mdoc.global_data.model_dump()
section_data = {
k: [section.model_dump()[k] for section in mdoc.section_data]
for k
in mdoc.section_data[0].model_dump().keys()
}
df = pd.DataFrame(data=section_data)

# add duplicate copies of global data and mdoc file titles to each row of
# the dataframe - tidy data is easier to analyse
for k, v in global_data.items():
df[k] = [v] * len(df)
df['titles'] = [mdoc.titles] * len(df)
df = df.dropna(axis='columns', how='all')
return df
return Mdoc.from_file(filename).as_dataframe()
7 changes: 6 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import pytest
from pathlib import Path

import pytest


@pytest.fixture
def tilt_series_mdoc_file():
    """Path of ``tilt_series.mdoc`` in the ``test_data`` directory beside this module."""
    test_data_dir = Path(__file__).parent / 'test_data'
    return test_data_dir / 'tilt_series.mdoc'

@pytest.fixture
def tilt_series_mdoc_string():
    """Text of ``tilt_series.mdoc`` in the ``test_data`` directory beside this module."""
    mdoc_path = Path(__file__).parent / 'test_data' / 'tilt_series.mdoc'
    return mdoc_path.read_text()

@pytest.fixture
def montage_section_mdoc_file():
Expand Down
6 changes: 6 additions & 0 deletions tests/test_functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd

from mdocfile import read
from mdocfile.data_models import Mdoc


def test_read_tilt_series_mdoc(tilt_series_mdoc_file):
Expand All @@ -9,6 +10,11 @@ def test_read_tilt_series_mdoc(tilt_series_mdoc_file):
assert df.shape == (41, 26)
assert 'TiltAngle' in df.columns

def test_read_tilt_series_mdoc_string(tilt_series_mdoc_string):
    """Parsing the tilt-series mdoc from a string gives the expected dataframe."""
    parsed = Mdoc.from_string(tilt_series_mdoc_string)
    df = parsed.as_dataframe()
    assert isinstance(df, pd.DataFrame)
    assert df.shape == (41, 26)
    assert 'TiltAngle' in df.columns

def test_read_montage_section_mdoc(montage_section_mdoc_file):
df = read(montage_section_mdoc_file)
Expand Down
Loading