Skip to content

Commit

Permalink
refactor reading of PDB and XDS_ASCII files
Browse files Browse the repository at this point in the history
move more code from headers to cpp files

add LineReader helper class that provides type erasure - this allows
to reduce the usage of templates when reading pdb and xds files
  • Loading branch information
wojdyr committed Nov 19, 2024
1 parent 7a4478e commit 4e08d3d
Show file tree
Hide file tree
Showing 6 changed files with 630 additions and 633 deletions.
37 changes: 26 additions & 11 deletions include/gemmi/input.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,17 +151,32 @@ class BasicInput {
std::string path_;
};

template<typename Input>
inline size_t copy_line_from_stream(char* line, int size, Input&& in) {
if (!in.gets(line, size))
return 0;
size_t len = std::strlen(line);
// If a line is longer than size we discard the rest of it.
if (len > 0 && line[len-1] != '\n')
for (int c = in.getc(); c > 0 /* not 0 nor EOF */ && c != '\n'; c = in.getc())
continue;
return len;
}
/// type-erased line-based reader
class LineReaderBase {
public:
virtual size_t copy_line(char* line, int size) = 0;
};

template<typename T>
class LineReader final : public LineReaderBase {
public:
template<typename... Args> LineReader(Args&& ...args)
: stream_{std::forward<Args>(args)...} {}

size_t copy_line(char* line, int size) {
if (!stream_.gets(line, size))
return 0;
size_t len = std::strlen(line);
// If a line is longer than size we discard the rest of it.
if (len > 0 && line[len-1] != '\n')
for (int c = stream_.getc(); c > 0 /* not 0 nor EOF */ && c != '\n'; c = stream_.getc())
continue;
return len;
};

private:
T stream_;
};

} // namespace gemmi
#endif
74 changes: 18 additions & 56 deletions include/gemmi/pdb.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,74 +21,33 @@

namespace gemmi {

/// Returns operations corresponding to 1555, 2555, ... N555
GEMMI_DLL std::vector<Op> read_remark_290(const std::vector<std::string>& raw_remarks);

namespace impl {

// Compare the first 4 letters of s, ignoring case, with uppercase record.
// Both args must have at least 3+1 chars. ' ' and NUL are equivalent in s.
inline bool is_record_type(const char* s, const char* record) {
/// Compare the first 4 letters of s, ignoring case, with uppercase record.
/// Both args must have at least 3+1 chars. ' ' and NUL are equivalent in s.
inline bool is_record_type4(const char* s, const char* record) {
return ialpha4_id(s) == ialpha4_id(record);
}
// for record "TER": "TER ", TER\n, TER\r, TER\t match, TERE, TER1 don't
/// for record "TER": "TER ", TER\n, TER\r, TER\t match, TERE, TER1 don't
inline bool is_record_type3(const char* s, const char* record) {
return (ialpha4_id(s) & ~0xf) == ialpha4_id(record);
}

struct GEMMI_DLL PdbReader {
PdbReader(const PdbReadOptions& options_) : options(options_) {
if (options.max_line_length <= 0 || options.max_line_length > 120)
options.max_line_length = 120;
}

template<typename Stream>
Structure from_stream(Stream&& stream, const std::string& source) {
Structure st;
st.input_format = CoorFormat::Pdb;
st.name = path_basename(source, {".gz", ".pdb"});
char line[122] = {0};
while (size_t len = copy_line_from_stream(line, options.max_line_length+1, stream)) {
++line_num;
read_pdb_line(line, len, st, source);
if (is_end)
break;
}
finalize_structure_after_reading_pdb(st);
return st;
}

private:
int line_num = 0;
bool after_ter = false;
bool is_end = false;
PdbReadOptions options;
Model *model = nullptr;
Chain *chain = nullptr;
Residue *resi = nullptr;
Transform matrix;
std::vector<std::string> conn_records;
std::unordered_map<ResidueId, int> resmap;

[[noreturn]] void wrong(const std::string& msg) const {
fail("Problem in line ", std::to_string(line_num), ": " + msg);
}
void read_pdb_line(const char* line, size_t len, Structure& st, const std::string& source);
void finalize_structure_after_reading_pdb(Structure& st) const;
};
/// Returns operations corresponding to 1555, 2555, ... N555
GEMMI_DLL std::vector<Op> read_remark_290(const std::vector<std::string>& raw_remarks);

} // namespace impl
GEMMI_DLL Structure read_pdb_from_stream(LineReaderBase&& line_reader,
const std::string& source,
PdbReadOptions options);

inline Structure read_pdb_file(const std::string& path,
PdbReadOptions options=PdbReadOptions()) {
auto f = file_open(path.c_str(), "rb");
return impl::PdbReader(options).from_stream(FileStream{f.get()}, path);
return read_pdb_from_stream(LineReader<FileStream>{f.get()}, path, options);
}

inline Structure read_pdb_from_memory(const char* data, size_t size,
const std::string& name,
PdbReadOptions options=PdbReadOptions()) {
return impl::PdbReader(options).from_stream(MemoryStream(data, size), name);
return read_pdb_from_stream(LineReader<MemoryStream>{data, size}, name, options);
}

inline Structure read_pdb_string(const std::string& str,
Expand All @@ -100,10 +59,13 @@ inline Structure read_pdb_string(const std::string& str,
// A function for transparent reading of stdin and/or gzipped files.
template<typename T>
inline Structure read_pdb(T&& input, PdbReadOptions options=PdbReadOptions()) {
if (input.is_stdin())
return impl::PdbReader(options).from_stream(FileStream{stdin}, "stdin");
if (input.is_compressed())
return impl::PdbReader(options).from_stream(input.get_uncompressing_stream(), input.path());
if (input.is_stdin()) {
return read_pdb_from_stream(LineReader<FileStream>{stdin}, "stdin", options);
}
if (input.is_compressed()) {
using LR = LineReader<decltype(input.get_uncompressing_stream())>;
return read_pdb_from_stream(LR{input.get_uncompressing_stream()}, input.path(), options);
}
return read_pdb_file(input.path(), options);
}

Expand Down
224 changes: 8 additions & 216 deletions include/gemmi/xds_ascii.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,43 +84,23 @@ struct GEMMI_DLL XdsAscii {
isets.emplace_back(id);
return isets.back();
}
template<typename Stream>
void read_stream(Stream&& stream, const std::string& source);
void read_stream(LineReaderBase&& reader, const std::string& source);

template<typename T>
void read_input(T&& input) {
if (input.is_stdin()) {
read_stream(FileStream{stdin}, "stdin");
read_stream(LineReader<FileStream>{stdin}, "stdin");
} else if (input.is_compressed()) {
read_stream(input.get_uncompressing_stream(), input.path());
using LR = LineReader<decltype(input.get_uncompressing_stream())>;
read_stream(LR{input.get_uncompressing_stream()}, input.path());
} else {
auto f = file_open(input.path().c_str(), "r");
read_stream(FileStream{f.get()}, input.path());
read_stream(LineReader<FileStream>{f.get()}, input.path());
}
}

void gather_iset_statistics() {
for (Iset& iset : isets) {
iset.frame_number_min = INT_MAX;
iset.frame_number_max = 0;
for (const XdsAscii::Refl& refl : data)
if (refl.iset == iset.id) {
++iset.reflection_count;
int frame = refl.frame();
iset.frame_number_min = std::min(iset.frame_number_min, frame);
iset.frame_number_max = std::max(iset.frame_number_max, frame);
}
if (iset.frame_number_min > iset.frame_number_max)
continue;
std::vector<uint8_t> frames(iset.frame_number_max - iset.frame_number_min + 1);
for (const XdsAscii::Refl& refl : data)
if (refl.iset == iset.id)
frames[refl.frame() - iset.frame_number_min] = 1;
iset.frame_count = 0;
for (uint8_t f : frames)
iset.frame_count += f;
}
}
// set a few Iset properties in isets
void gather_iset_statistics();

double rot_angle(const Refl& refl) const {
double z = refl.zd - starting_frame + 1;
Expand Down Expand Up @@ -193,198 +173,10 @@ struct GEMMI_DLL XdsAscii {
}
};

template<size_t N>
bool starts_with_ptr(const char* a, const char (&b)[N], const char** endptr) {
if (std::strncmp(a, b, N-1) != 0)
return false;
*endptr = a + N - 1;
return true;
}

template<size_t N>
bool starts_with_ptr_b(const char* a, const char (&b)[N], const char** endptr) {
return starts_with_ptr<N>(skip_blank(a), b, endptr);
}

inline const char* parse_number_into(const char* start, const char* end,
double& val, const char* line) {
auto result = fast_from_chars(start, end, val);
if (result.ec != std::errc())
fail("failed to parse number in:\n", line);
return result.ptr;
}

template<int N>
void parse_numbers_into_array(const char* start, const char* end,
double (&arr)[N], const char* line) {
for (int i = 0; i < N; ++i) {
auto result = fast_from_chars(start, end, arr[i]);
if (result.ec != std::errc())
fail("failed to parse number #", i+1, " in:\n", line);
start = result.ptr;
}
}

template<typename Stream>
void XdsAscii::read_stream(Stream&& stream, const std::string& source) {
source_path = source;
read_columns = 12;
char line[256];
size_t len0 = copy_line_from_stream(line, 255, stream);
int iset_col = 0;
if (len0 == 0 || !(starts_with(line, "!FORMAT=XDS_ASCII MERGE=FALSE") ||
(starts_with(line, "!OUTPUT_FILE=INTEGRATE.HKL"))))
fail("not an unmerged XDS_ASCII nor INTEGRATE.HKL file: " + source_path);
const char* rhs;
while (size_t len = copy_line_from_stream(line, 255, stream)) {
if (line[0] == '!') {
if (starts_with_ptr(line+1, "Generated by ", &rhs)) {
generated_by = read_word(rhs, &rhs);
version_str = trim_str(rhs);
} else if (starts_with_ptr(line+1, "SPACE_GROUP_NUMBER=", &rhs)) {
spacegroup_number = simple_atoi(rhs);
} else if (starts_with_ptr(line+1, "UNIT_CELL_", &rhs)) {
if (starts_with_ptr(rhs, "CONSTANTS=", &rhs)) { // UNIT_CELL_CONSTANTS=
parse_numbers_into_array(rhs, line+len, cell_constants, line);
} else if (starts_with_ptr(rhs, "A-AXIS=", &rhs)) { // UNIT_CELL_A-AXIS=
parse_numbers_into_array(rhs, line+len, cell_axes.a[0], line);
} else if (starts_with_ptr(rhs, "B-AXIS=", &rhs)) { // UNIT_CELL_B-AXIS=
parse_numbers_into_array(rhs, line+len, cell_axes.a[1], line);
} else if (starts_with_ptr(rhs, "C-AXIS=", &rhs)) { // UNIT_CELL_C-AXIS=
parse_numbers_into_array(rhs, line+len, cell_axes.a[2], line);
}
} else if (starts_with_ptr(line+1, "REFLECTING_RANGE_E.S.D.=", &rhs)) {
auto result = fast_from_chars(rhs, line+len, reflecting_range_esd);
if (result.ec != std::errc())
fail("failed to parse mosaicity:\n", line);
} else if (starts_with_ptr(line+1, "X-RAY_WAVELENGTH=", &rhs)) {
auto result = fast_from_chars(rhs, line+len, wavelength);
if (result.ec != std::errc())
fail("failed to parse wavelength:\n", line);
} else if (starts_with_ptr(line+1, "INCIDENT_BEAM_DIRECTION=", &rhs)) {
parse_numbers_into_array(rhs, line+len, incident_beam_dir, line);
} else if (starts_with_ptr(line+1, "OSCILLATION_RANGE=", &rhs)) {
auto result = fast_from_chars(rhs, line+len, oscillation_range);
if (result.ec != std::errc())
fail("failed to parse:\n", line);
} else if (starts_with_ptr(line+1, "ROTATION_AXIS=", &rhs)) {
parse_numbers_into_array(rhs, line+len, rotation_axis, line);
} else if (starts_with_ptr(line+1, "STARTING_ANGLE=", &rhs)) {
auto result = fast_from_chars(rhs, line+len, starting_angle);
if (result.ec != std::errc())
fail("failed to parse:\n", line);
} else if (starts_with_ptr(line+1, "STARTING_FRAME=", &rhs)) {
starting_frame = simple_atoi(rhs);
} else if (starts_with_ptr(line+1, " ISET= ", &rhs)) {
const char* endptr;
int id = simple_atoi(rhs, &endptr);
XdsAscii::Iset& iset = find_or_add_iset(id);
endptr = skip_blank(endptr);
if (starts_with_ptr(endptr, "INPUT_FILE=", &rhs)) {
iset.input_file = read_word(rhs);
} else if (starts_with_ptr(endptr, "X-RAY_WAVELENGTH=", &rhs)) {
double w;
auto result = fast_from_chars(rhs, line+len, w);
if (result.ec != std::errc())
fail("failed to parse iset wavelength:\n", line);
iset.wavelength = w;
} else if (starts_with_ptr(endptr, "UNIT_CELL_CONSTANTS=", &rhs)) {
parse_numbers_into_array(rhs, line+len, iset.cell_constants, line);
}
} else if (starts_with_ptr(line+1, "NX=", &rhs)) {
const char* endptr;
nx = simple_atoi(rhs, &endptr);
if (starts_with_ptr_b(endptr, "NY=", &rhs))
ny = simple_atoi(rhs, &endptr);
if (starts_with_ptr_b(endptr, "QX=", &rhs))
endptr = parse_number_into(rhs, line+len, qx, line);
if (starts_with_ptr_b(endptr, "QY=", &rhs))
parse_number_into(rhs, line+len, qy, line);
} else if (starts_with_ptr(line+1, "ORGX=", &rhs)) {
const char* endptr = parse_number_into(rhs, line+len, orgx, line);
if (starts_with_ptr_b(endptr, "ORGY=", &rhs))
endptr = parse_number_into(rhs, line+len, orgy, line);
if (starts_with_ptr_b(endptr, "DETECTOR_DISTANCE=", &rhs))
parse_number_into(rhs, line+len, detector_distance, line);
} else if (starts_with_ptr(line+1, "NUMBER_OF_ITEMS_IN_EACH_DATA_RECORD=", &rhs)) {
int num = simple_atoi(rhs);
// INTEGRATE.HKL has read_columns=12, as set above
if (generated_by == "XSCALE")
read_columns = 8;
else if (generated_by == "CORRECT")
read_columns = 11;
// check if the columns are what they always are
if (num < read_columns)
fail("expected ", std::to_string(read_columns), "+ columns, got:\n", line);
if (generated_by == "INTEGRATE") {
copy_line_from_stream(line, 52, stream);
if (!starts_with(line, "!H,K,L,IOBS,SIGMA,XCAL,YCAL,ZCAL,RLP,PEAK,CORR,MAXC"))
fail("unexpected column order in INTEGRATE.HKL");
} else {
const char* expected_columns[12] = {
"H=1", "K=2", "L=3", "IOBS=4", "SIGMA(IOBS)=5",
"XD=6", "YD=7", "ZD=8", "RLP=9", "PEAK=10", "CORR=11", "MAXC=12"
};
for (int i = 0; i < read_columns; ++i) {
const char* col = expected_columns[i];
copy_line_from_stream(line, 42, stream);
if (std::strncmp(line, "!ITEM_", 6) != 0 ||
std::strncmp(line+6, col, std::strlen(col)) != 0)
fail("column !ITEM_" + std::string(col), " not found.");
}
}
} else if (starts_with_ptr(line+1, "ITEM_ISET=", &rhs)) {
iset_col = simple_atoi(rhs);
} else if (starts_with(line+1, "END_OF_DATA")) {
if (isets.empty()) {
isets.emplace_back(1);
isets.back().wavelength = wavelength;
}
for (XdsAscii::Refl& refl : data)
if (size_t(refl.iset - 1) >= isets.size())
fail("unexpected ITEM_ISET " + std::to_string(refl.iset));
return;
}
} else {
data.emplace_back();
XdsAscii::Refl& r = data.back();
const char* p = line;
for (int i = 0; i < 3; ++i)
r.hkl[i] = simple_atoi(p, &p);
auto result = fast_from_chars(p, line+len, r.iobs); // 4
result = fast_from_chars(result.ptr, line+len, r.sigma); // 5
result = fast_from_chars(result.ptr, line+len, r.xd); // 6
result = fast_from_chars(result.ptr, line+len, r.yd); // 7
result = fast_from_chars(result.ptr, line+len, r.zd); // 8
if (read_columns >= 11) {
result = fast_from_chars(result.ptr, line+len, r.rlp); // 9
result = fast_from_chars(result.ptr, line+len, r.peak); // 10
result = fast_from_chars(result.ptr, line+len, r.corr); // 11
if (read_columns > 11) {
result = fast_from_chars(result.ptr, line+len, r.maxc); // 12
} else {
r.maxc = 0;
}
} else {
r.rlp = r.peak = r.corr = r.maxc = 0;
}
if (result.ec != std::errc())
fail("failed to parse data line:\n", line);
if (iset_col >= read_columns) {
const char* iset_ptr = result.ptr;
for (int j = read_columns+1; j < iset_col; ++j)
iset_ptr = skip_word(skip_blank(iset_ptr));
r.iset = simple_atoi(iset_ptr);
}
}
}
fail("incorrect or unfinished file: " + source_path);
}

inline XdsAscii read_xds_ascii_file(const std::string& path) {
auto f = file_open(path.c_str(), "r");
XdsAscii ret;
ret.read_stream(FileStream{f.get()}, path);
ret.read_stream(LineReader<FileStream>{f.get()}, path);
return ret;
}

Expand Down
Loading

0 comments on commit 4e08d3d

Please sign in to comment.