Add parameter for results queue size to make_reader
Robbie Gruener authored and rgruener committed Dec 10, 2018
1 parent c4d6342 commit 11bd9e3
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions petastorm/reader.py
@@ -53,7 +53,7 @@
 
 def make_reader(dataset_url,
                 schema_fields=None,
-                reader_pool_type='thread', workers_count=10, pyarrow_serialize=False,
+                reader_pool_type='thread', workers_count=10, pyarrow_serialize=False, results_queue_size=50,
                 shuffle_row_groups=True, shuffle_row_drop_partitions=1,
                 predicate=None,
                 rowgroup_selector=None,
@@ -82,6 +82,8 @@ def make_reader(dataset_url,
         thread or process pool. Defaults to 10
     :param pyarrow_serialize: Whether to use pyarrow for serialization. Currently only applicable to process pool.
         Defaults to False.
+    :param results_queue_size: Size of the results queue to store prefetched rows. Currently only applicable to
+        thread reader pool type.
     :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
     :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
         break up a row group into for increased shuffling in exchange for worse performance (extra reads).
@@ -143,7 +145,7 @@ def make_reader(dataset_url,
 
     if reader_engine == 'reader_v1':
         if reader_pool_type == 'thread':
-            reader_pool = ThreadPool(workers_count)
+            reader_pool = ThreadPool(workers_count, results_queue_size)
        elif reader_pool_type == 'process':
            if pyarrow_serialize:
                serializer = PyArrowSerializer()
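
For illustration, a minimal usage sketch of the new parameter. The dataset URL and the process() helper are hypothetical; make_reader and its parameters are from the diff above:

    from petastorm import make_reader

    # results_queue_size caps how many prefetched rows the thread pool may
    # buffer ahead of the consumer (this commit defaults it to 50).
    with make_reader('hdfs:///path/to/dataset',   # hypothetical URL
                     reader_pool_type='thread',   # the parameter only applies to the thread pool
                     workers_count=10,
                     results_queue_size=100) as reader:
        for row in reader:
            process(row)  # placeholder for application logic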
