Update biobear to allow LazyFrame loading

TRISTAN-ORF · Jan 22, 2025 · 417324c
1 parent 14592ce
commit 417324c
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 9 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -22,7 +22,7 @@ install_requires =
     numpy >= 1.26.4
     scipy >= 1.13.0
     pandas >= 2.2.2
-    biobear >= 0.20.2
+    biobear >= 0.23.7
     pyarrow >= 18.0.0
     h5max == 0.3.2
     fasta-reader == 3.0.2

diff --git a/transcript_transformer/data.py b/transcript_transformer/data.py
@@ -150,14 +150,8 @@ def process_ribo_data(
             s = f"CREATE EXTERNAL TABLE test STORED AS BAM LOCATION '{path}'"
             ctx = bb.connect()
             ctx.sql(s)
-            exe = ctx.sql("SELECT reference, start, sequence FROM test")
-            # Convert the list of RecordBatches to a Table
-            table = Table.from_batches(exe.to_arrow_record_batch_reader())
-            # Create a Dataset from the Table
-            ds = dataset(table)
-            # Lazyframe
-            lf = pl.scan_pyarrow_dataset(ds)
-
+            s_2 = "SELECT reference, start, sequence FROM test"
+            lf = ctx.sql(s_2).to_polars(lazy=True)
         else:
             raise TypeError(f"file extension {file_ext} not supported")
         new_columns = ["transcript_id", "pos", "read"]