diff --git a/python/README.md b/python/README.md index 80673c368..2a46f06a7 100644 --- a/python/README.md +++ b/python/README.md @@ -43,7 +43,7 @@ import pyarrow f = ballista.functions # create a context -ctx = ballista.SessionContext() +ctx = ballista.BallistaContext("localhost", 50050) # create a RecordBatch and a new DataFrame from it batch = pyarrow.RecordBatch.from_arrays( @@ -65,6 +65,14 @@ assert result.column(0) == pyarrow.array([5, 7, 9]) assert result.column(1) == pyarrow.array([-3, -3, -3]) ``` +### Specifying Configuration Options + +Configuration settings can be specified when creating the context. + +```python +ctx = ballista.BallistaContext("localhost", 50050, shuffle_partitions = 200, batch_size = 16384) +``` + ### UDFs ```python diff --git a/python/src/ballista_context.rs b/python/src/ballista_context.rs index 40e389e78..956d2e359 100644 --- a/python/src/ballista_context.rs +++ b/python/src/ballista_context.rs @@ -38,10 +38,20 @@ pub(crate) struct PyBallistaContext { #[pymethods] impl PyBallistaContext { #[new] - #[args(port = "50050")] - fn new(py: Python, host: &str, port: u16) -> PyResult { + #[args(port = "50050", shuffle_partitions = 4, batch_size = 8192)] + fn new( + py: Python, + host: &str, + port: u16, + shuffle_partitions: usize, + batch_size: usize, + ) -> PyResult { let config = BallistaConfig::builder() - .set("ballista.shuffle.partitions", "4") + .set( + "ballista.shuffle.partitions", + &format!("{}", shuffle_partitions), + ) + .set("ballista.batch.size", &format!("{}", batch_size)) .set("ballista.with_information_schema", "true") .build() .map_err(BallistaError::from)?;