fix: switch pytest to unittest #146

Merged · 21 commits · Jul 8, 2024
4 changes: 2 additions & 2 deletions .appveyor.yml
@@ -14,8 +14,8 @@ install:
   - conda config --set always_yes yes --set changeps1 no
   - conda update -q conda
   - conda info -a
-  - conda create -q -n test-environment --channel=conda-forge mmtf-python numpy scipy pandas nose looseversion python=%PYTHON_VERSION%
+  - conda create -q -n test-environment --channel=conda-forge mmtf-python numpy scipy pandas pytest looseversion python=%PYTHON_VERSION%
   - activate test-environment

 test_script:
-  - nosetests -s -v
+  - pytest -s -v
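The runner switch needs no changes to the test modules themselves: pytest discovers the same test_*.py files, test_* functions, and plain assert statements that nose collected. A minimal sketch of a test that runs under either runner (file and function names here are illustrative, not part of this PR):

# test_example.py -- hypothetical module; pytest collects any test_* function.
def test_addition():
    assert 1 + 1 == 2

As with nosetests, -s disables output capturing and -v prints one line per test, so the new pytest -s -v invocation behaves like the old one.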
16 changes: 16 additions & 0 deletions .github/workflows/changelog-enforcer.yaml
@@ -0,0 +1,16 @@
+name: Changelog Enforcer
+
+on: # yamllint disable-line rule:truthy
+  pull_request:
+    types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled]
+
+jobs:
+
+  changelog:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: dangoslen/changelog-enforcer@v3
+        with:
+          skipLabels: 'skip-changelog'
1,213 changes: 1,039 additions & 174 deletions biopandas/constants.py

Large diffs are not rendered by default.

37 changes: 24 additions & 13 deletions biopandas/mmcif/mmcif_parser.py
@@ -22,19 +22,28 @@ def __init__(self, parser_obj):
         self.names_defined = False

     def add_name(self, name):
-        cat_name = type(name) == str and partition_string(name, ".") or ["", "", ""]
+        cat_name = (
+            isinstance(name, str) and partition_string(name, ".") or ["", "", ""]
+        )
         if cat_name[1]:
             if cat_name[0] not in self.parser_obj.current_target[-2]:
                 self.parser_obj.current_target[-2][cat_name[0]] = {}
-            if cat_name[2] not in self.parser_obj.current_target[-2][cat_name[0]]:
-                self.parser_obj.current_target[-2][cat_name[0]][cat_name[2]] = []
+            if (
+                cat_name[2]
+                not in self.parser_obj.current_target[-2][cat_name[0]]
+            ):
+                self.parser_obj.current_target[-2][cat_name[0]][
+                    cat_name[2]
+                ] = []
             self.ref_list.append(
                 self.parser_obj.current_target[-2][cat_name[0]][cat_name[2]]
             )
         else:
             if cat_name[0] not in self.parser_obj.current_target[-2]:
                 self.parser_obj.current_target[-2][cat_name[0]] = []
-            self.ref_list.append(self.parser_obj.current_target[-2][cat_name[0]])
+            self.ref_list.append(
+                self.parser_obj.current_target[-2][cat_name[0]]
+            )
         self.length = len(self.ref_list)

     def push_value(self, value):
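The type(name) == str to isinstance(name, str) change is more than a style fix: isinstance also accepts subclasses of str, and it is the comparison flake8's E721 check expects. A standalone illustration (not code from this PR):

class Label(str):
    """A hypothetical str subclass, e.g. a tagged identifier."""

s = Label("_atom_site.id")
print(type(s) == str)      # False: the exact type is Label
print(isinstance(s, str))  # True: Label is a subclass of str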
@@ -218,16 +227,16 @@ def __repr__(self):
 def __cif_float_range__(inp):
     try:
         pos = inp.index("-", 1)
-        return (__CIFFloat__(inp[:pos]), __CIFFloat__(inp[pos + 1 :]))
-    except:
+        return (__CIFFloat__(inp[:pos]), __CIFFloat__(inp[pos + 1:]))
+    except Exception:
         return (__CIFFloat__(inp),)


 def __cif_int_range__(inp):
     try:
         pos = inp.index("-", 1)
-        return (__CIFInt__(inp[:pos]), __CIFInt__(inp[pos + 1 :]))
-    except:
+        return (__CIFInt__(inp[:pos]), __CIFInt__(inp[pos + 1:]))
+    except Exception:
         return (__CIFInt__(inp),)
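Two improvements land in this hunk. First, inp.index("-", 1) looks for the range separator starting at position 1, so a leading minus sign (a negative lower bound) is never mistaken for the separator; when no separator exists, index raises ValueError and the except path returns a one-element tuple. Second, the bare except: becomes except Exception:, which no longer swallows KeyboardInterrupt and SystemExit. A rough standalone equivalent of the float version, using plain float in place of __CIFFloat__:

def cif_float_range(inp):
    """Parse 'a-b' into (a, b); a lone value yields a 1-tuple."""
    try:
        # Search from index 1 so a leading '-' (negative number)
        # is not treated as the range separator.
        pos = inp.index("-", 1)
        return (float(inp[:pos]), float(inp[pos + 1:]))
    except ValueError:  # no separator found
        return (float(inp),)

print(cif_float_range("1.5-2.5"))  # (1.5, 2.5)
print(cif_float_range("-1.5"))     # (-1.5,)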


@@ -239,12 +248,12 @@ def __load_cif_dic__(dic_file, force=False):
         if force:
             throw
         dic = json.loads(open(jsf).read())
-    except:
+    except Exception:
         parser = CIFParser()
         parser.parse(open(dic_file))
         json.dump(parser.data, open(jsf_dic, "w"))
         for k, v in parser.data["data_mmcif_pdbx.dic"].items():
-            if type(v) != dict or "item_type" not in v:
+            if not isinstance(v, dict) or "item_type" not in v:
                 continue
             name = partition_string(k[6:], ".")
             if name[0] not in dic:
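For context: this function caches the parsed mmCIF dictionary as JSON and re-parses only when the cache is unusable or force is set. The bare name throw is not Python syntax for raising; it triggers a NameError at runtime that the except clause catches, i.e. it is a deliberate jump into the rebuild branch. A clearer way to write the same cache-or-rebuild pattern (a sketch, not the PR's code):

import json
import os

def load_with_cache(parse, src_path, cache_path, force=False):
    """Return parsed data, rebuilding the JSON cache when needed."""
    if not force and os.path.exists(cache_path):
        with open(cache_path) as fh:
            return json.load(fh)
    data = parse(src_path)        # the expensive parse
    with open(cache_path, "w") as fh:
        json.dump(data, fh)       # refresh the cache
    return data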
@@ -285,11 +294,13 @@ def __dump_cif__(jso):
 def __dump_str__(inp):
     if inp is None:
         return "?"
-    if type(inp) is not str:
+    if not isinstance(inp, str):
         return str(inp)
     if re.search(__CIF_STR_NL_CHECK__, inp) is not None:
         return "\n;%s\n;" % inp
-    return "'%s'" % inp if re.search(__CIF_STR_CHECK__, inp) is not None else inp
+    return (
+        "'%s'" % inp if re.search(__CIF_STR_CHECK__, inp) is not None else inp
+    )


 def __pad_string__(inp, flength):
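__dump_str__ encodes mmCIF's value-quoting rules: None serializes to the unknown marker "?", non-strings are stringified, values containing a newline are wrapped in a semicolon text block, and values matching __CIF_STR_CHECK__ are single-quoted. Both regexes live outside this hunk; assuming the check fires on whitespace and quote characters, a standalone sketch looks like:

import re

def dump_str(value):
    """Serialize one value with mmCIF quoting conventions (sketch)."""
    if value is None:
        return "?"                 # mmCIF marker for 'unknown'
    if not isinstance(value, str):
        return str(value)
    if "\n" in value:
        return "\n;%s\n;" % value  # multi-line semicolon block
    if re.search(r"[\s'\"]", value):  # assumed check, see lead-in
        return "'%s'" % value
    return value

print(dump_str(None))        # ?
print(dump_str("HIS A 12"))  # 'HIS A 12'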
@@ -354,7 +365,7 @@ def __dump_part__(jso):

 def load_cif_data(data, do_clean=True, do_type=True):
     parser = CIFParser()
-    if type(data) == str:
+    if isinstance(data, str):
         parser.parse_string(data)
     else:
         parser.parse(data)  # fileobj
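load_cif_data accepts either a raw string or a file-like object, dispatching on isinstance(data, str). A minimal usage sketch (the tiny CIF content and file name are illustrative):

from biopandas.mmcif.mmcif_parser import load_cif_data

cif_text = "data_test\n_entry.id TEST\n"
data = load_cif_data(cif_text)     # str input -> parse_string()

with open("structure.cif") as fh:  # assumes such a file exists
    data = load_cif_data(fh)       # file object -> parse()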
144 changes: 98 additions & 46 deletions biopandas/mmcif/pandas_mmcif.py
@@ -1,4 +1,5 @@
"""Class for working with MMCIF files."""

# BioPandas
# Authors: Arian Jamasb <[email protected]>,
# Authors: Sebastian Raschka <[email protected]>
@@ -69,56 +70,76 @@ def read_mmcif(self, path):
         self.code = self.data["entry"]["id"][0].lower()
         return self

-    def fetch_mmcif(self, pdb_code: Optional[str] = None, uniprot_id: Optional[str] = None, source: str = "pdb"):
+    def fetch_mmcif(
+        self,
+        pdb_code: Optional[str] = None,
+        uniprot_id: Optional[str] = None,
+        source: str = "pdb",
+    ):
         """Fetches mmCIF file contents from the Protein Databank at rcsb.org or AlphaFold database at https://alphafold.ebi.ac.uk/.

-    Parameters
-    ----------
-    pdb_code : str, optional
-        A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the PDB. Defaults to `None`.
+        Parameters
+        ----------
+        pdb_code : str, optional
+            A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the PDB. Defaults to `None`.

-    uniprot_id : str, optional
-        A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from the AF2 database. Defaults to `None`.
+        uniprot_id : str, optional
+            A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from the AF2 database. Defaults to `None`.

-    source : str
-        The source to retrieve the structure from
-        (`"pdb"`, `"alphafold2-v3"` or `"alphafold2-v4"`). Defaults to `"pdb"`.
+        source : str
+            The source to retrieve the structure from
+            (`"pdb"`, `"alphafold2-v3"` or `"alphafold2-v4"`). Defaults to `"pdb"`.

-    Returns
-    ---------
-    self
+        Returns
+        ---------
+        self

         """
         # Sanitize input
         invalid_input_identifier_1 = pdb_code is None and uniprot_id is None
-        invalid_input_identifier_2 = pdb_code is not None and uniprot_id is not None
-        invalid_input_combination_1 = uniprot_id is not None and source == "pdb"
+        invalid_input_identifier_2 = (
+            pdb_code is not None and uniprot_id is not None
+        )
+        invalid_input_combination_1 = (
+            uniprot_id is not None and source == "pdb"
+        )
         invalid_input_combination_2 = pdb_code is not None and source in {
-            "alphafold2-v3", "alphafold2-v4"}
+            "alphafold2-v3",
+            "alphafold2-v4",
+        }

         if invalid_input_identifier_1 or invalid_input_identifier_2:
             raise ValueError(
-                "Please provide either a PDB code or a UniProt ID.")
+                "Please provide either a PDB code or a UniProt ID."
+            )

         if invalid_input_combination_1:
             raise ValueError(
-                "Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'.")
+                "Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'."
+            )
         elif invalid_input_combination_2:
             raise ValueError(
-                f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}.")
+                f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}."
+            )

         if source == "pdb":
             self.mmcif_path, self.mmcif_text = self._fetch_mmcif(pdb_code)
         elif source == "alphafold2-v3":
             af2_version = 3
-            self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
+            self.mmcif_path, self.mmcif_text = self._fetch_af2(
+                uniprot_id, af2_version
+            )
         elif source == "alphafold2-v4":
             af2_version = 4
-            self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
+            self.mmcif_path, self.mmcif_text = self._fetch_af2(
+                uniprot_id, af2_version
+            )
         else:
-            raise ValueError(f"Invalid source: {source}."
-                             " Please use one of 'pdb', 'alphafold2-v3' or 'alphafold2-v4'.")
+            raise ValueError(
+                f"Invalid source: {source}."
+                " Please use one of 'pdb', 'alphafold2-v3' or 'alphafold2-v4'."
+            )

         self._df = self._construct_df(text=self.mmcif_text)
         return self
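The sanitizing block enforces exactly one identifier and a source that matches it. In use, this might look like the following ("3eiy" and "Q5VSL9" are the docstring's own examples; network access required):

from biopandas.mmcif import PandasMmcif

pmmcif = PandasMmcif().fetch_mmcif(pdb_code="3eiy")  # from the PDB
afold = PandasMmcif().fetch_mmcif(
    uniprot_id="Q5VSL9", source="alphafold2-v4"
)  # from the AlphaFold database

# Passing both identifiers, or neither, raises ValueError:
# PandasMmcif().fetch_mmcif(pdb_code="3eiy", uniprot_id="Q5VSL9")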
@@ -129,7 +150,8 @@ def _construct_df(self, text: str):
         self.data = data
         df: Dict[str, pd.DataFrame] = {}
         full_df = pd.DataFrame.from_dict(
-            data["atom_site"], orient="index").transpose()
+            data["atom_site"], orient="index"
+        ).transpose()
         full_df = full_df.astype(mmcif_col_types, errors="ignore")
         df["ATOM"] = pd.DataFrame(full_df[full_df.group_PDB == "ATOM"])
         df["HETATM"] = pd.DataFrame(full_df[full_df.group_PDB == "HETATM"])
@@ -148,8 +170,9 @@ def _fetch_mmcif(pdb_code):
             response = urlopen(url)
             txt = response.read()
             txt = (
-                txt.decode(
-                    "utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
+                txt.decode("utf-8")
+                if sys.version_info[0] >= 3
+                else txt.encode("ascii")
             )
         except HTTPError as e:
             print(f"HTTP Error {e.code}")
@@ -166,11 +189,15 @@ def _fetch_af2(uniprot_id: str, af2_version: int = 3):
         try:
             response = urlopen(url)
             txt = response.read()
-            txt = txt.decode('utf-8') if sys.version_info[0] >= 3 else txt.encode('ascii')
+            txt = (
+                txt.decode("utf-8")
+                if sys.version_info[0] >= 3
+                else txt.encode("ascii")
+            )
         except HTTPError as e:
-            print(f'HTTP Error {e.code}')
+            print(f"HTTP Error {e.code}")
         except URLError as e:
-            print(f'URL Error {e.args}')
+            print(f"URL Error {e.args}")
         return url, txt

     @staticmethod
@@ -184,7 +211,8 @@ def _read_mmcif(path):
             openf = gzip.open
         else:
             allowed_formats = ", ".join(
-                (".cif", ".cif.gz", ".mmcif", ".mmcif.gz"))
+                (".cif", ".cif.gz", ".mmcif", ".mmcif.gz")
+            )
             raise ValueError(
                 f"Wrong file format; allowed file formats are {allowed_formats}"
             )
@@ -194,8 +222,9 @@ def _read_mmcif(path):

         if path.endswith(".gz"):
             txt = (
-                txt.decode(
-                    "utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
+                txt.decode("utf-8")
+                if sys.version_info[0] >= 3
+                else txt.encode("ascii")
             )
         return path, txt
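_read_mmcif chooses the opener by extension and decodes only gzipped input, since gzip.open in binary mode returns bytes. Through the public method this looks like (file names assumed):

from biopandas.mmcif import PandasMmcif

pmmcif = PandasMmcif().read_mmcif("3eiy.cif")        # plain text
pmmcif_gz = PandasMmcif().read_mmcif("3eiy.cif.gz")  # transparently gunzipped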

@@ -271,14 +300,19 @@ def _get_mainchain(
     def _get_hydrogen(df, invert):
         """Return only hydrogen atom entries from a DataFrame"""
         return (
-            df[(df["type_symbol"] != "H")] if invert else df[(
-                df["type_symbol"] == "H")]
+            df[(df["type_symbol"] != "H")]
+            if invert
+            else df[(df["type_symbol"] == "H")]
         )

     @staticmethod
     def _get_heavy(df, invert):
         """Return only heavy atom entries from a DataFrame"""
-        return df[df["type_symbol"] == "H"] if invert else df[df["type_symbol"] != "H"]
+        return (
+            df[df["type_symbol"] == "H"]
+            if invert
+            else df[df["type_symbol"] != "H"]
+        )

     @staticmethod
     def _get_calpha(df, invert, atom_col: str = "auth_atom_id"):
@@ -288,7 +322,11 @@ def _get_calpha(df, invert, atom_col: str = "auth_atom_id"):
     @staticmethod
     def _get_carbon(df, invert):
         """Return carbon atom entries from a DataFrame"""
-        return df[df["type_symbol"] != "C"] if invert else df[df["type_symbol"] == "C"]
+        return (
+            df[df["type_symbol"] != "C"]
+            if invert
+            else df[df["type_symbol"] == "C"]
+        )
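Each _get_* helper filters rows on the mmCIF type_symbol column, with invert flipping the selection. A toy DataFrame makes the semantics visible (hypothetical data, calling the static helpers directly):

import pandas as pd
from biopandas.mmcif import PandasMmcif

df = pd.DataFrame({"type_symbol": ["C", "H", "N", "H", "O"]})

print(PandasMmcif._get_heavy(df, invert=False))   # rows C, N, O
print(PandasMmcif._get_heavy(df, invert=True))    # the two H rows
print(PandasMmcif._get_carbon(df, invert=False))  # the C row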

     def amino3to1(
         self,
@@ -339,8 +377,9 @@ def amino3to1(
                 indices.append(ind)
             cmp = num

-        transl = tmp.iloc[indices][residue_col].map(
-            amino3to1dict).fillna(fillna)
+        transl = (
+            tmp.iloc[indices][residue_col].map(amino3to1dict).fillna(fillna)
+        )

         return pd.concat((tmp.iloc[indices][chain_col], transl), axis=1)
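amino3to1 keeps one row per residue and maps three-letter residue names to one-letter codes via amino3to1dict, writing fillna for unknown residues. Typical use (a sketch; assumes a fetched structure):

from biopandas.mmcif import PandasMmcif

pmmcif = PandasMmcif().fetch_mmcif(pdb_code="3eiy")
seq = pmmcif.amino3to1()        # two columns: chain id, one-letter code
print("".join(seq.iloc[:, 1]))  # concatenated sequence, all chains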

@@ -425,7 +464,9 @@ def distance(self, xyz=(0.00, 0.00, 0.00), records=("ATOM", "HETATM")):

         return np.sqrt(
             np.sum(
-                df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1) ** 2, axis=1
+                df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1)
+                ** 2,
+                axis=1,
             )
         )

@@ -451,7 +492,9 @@ def distance_df(df, xyz=(0.00, 0.00, 0.00)):
"""
         return np.sqrt(
             np.sum(
-                df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1) ** 2, axis=1
+                df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1)
+                ** 2,
+                axis=1,
             )
         )
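Both distance and the static distance_df compute a plain Euclidean distance: subtract the reference point from the Cartn_x/y/z columns, square, sum across columns, take the square root. The same arithmetic in isolation, with toy coordinates:

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {"Cartn_x": [1.0, 3.0], "Cartn_y": [2.0, 4.0], "Cartn_z": [2.0, 0.0]}
)

dist = np.sqrt(np.sum(df.subtract((0.0, 0.0, 0.0), axis=1) ** 2, axis=1))
print(dist.tolist())  # [3.0, 5.0]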

@@ -485,7 +528,11 @@ def read_mmcif_from_list(self, mmcif_lines):
         self.code = self.data["entry"]["id"][0].lower()
         return self

-    def convert_to_pandas_pdb(self, offset_chains: bool = True, records: List[str] = ["ATOM", "HETATM"]) -> PandasPdb:
+    def convert_to_pandas_pdb(
+        self,
+        offset_chains: bool = True,
+        records: List[str] = ["ATOM", "HETATM"],
+    ) -> PandasPdb:
"""Returns a PandasPdb object with the same data as the PandasMmcif
object.

@@ -525,10 +572,15 @@ def convert_to_pandas_pdb(self, offset_chains: bool = True, records: List[str] =

         # Update atom numbers
         if offset_chains:
-            offsets = pandaspdb.df["ATOM"]["chain_id"].astype(
-                "category").cat.codes
-            pandaspdb.df["ATOM"]["atom_number"] = pandaspdb.df["ATOM"]["atom_number"] + offsets
+            offsets = (
+                pandaspdb.df["ATOM"]["chain_id"].astype("category").cat.codes
+            )
+            pandaspdb.df["ATOM"]["atom_number"] = (
+                pandaspdb.df["ATOM"]["atom_number"] + offsets
+            )
             hetatom_offset = offsets.max() + 1
-            pandaspdb.df["HETATM"]["atom_number"] = pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
+            pandaspdb.df["HETATM"]["atom_number"] = (
+                pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
+            )

         return pandaspdb
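convert_to_pandas_pdb re-keys the mmCIF columns into PDB-style dataframes; with offset_chains=True it shifts atom_number by a per-chain category code so numbering stays unique across chains, and HETATM records are numbered after the last chain offset. Usage sketch (assumes a fetched structure):

from biopandas.mmcif import PandasMmcif

pmmcif = PandasMmcif().fetch_mmcif(pdb_code="3eiy")
ppdb = pmmcif.convert_to_pandas_pdb()  # offset_chains=True by default
print(ppdb.df["ATOM"][["atom_number", "chain_id"]].head())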