Skip to content

Commit

Permalink
0.3.1 (#33)
Browse files Browse the repository at this point in the history
* update

* update

* update
  • Loading branch information
goodwanghan authored Sep 3, 2021
1 parent 5b80e0f commit 4aa3165
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 5 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ Fugue for Kaggle users

## Release History

### 0.3.2

* Use Fugue 0.6.2
* Make spark config smarter

### 0.3.1

* Use Fugue 0.6.0
Expand Down
16 changes: 13 additions & 3 deletions fuggle/execution_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Any, Dict, Iterable, Optional, Tuple

import pandas as pd
import psutil
from fugue import (
DataFrame,
DataFrames,
Expand Down Expand Up @@ -86,8 +87,8 @@ def __init__(self, spark_session: Optional[SparkSession] = None, conf: Any = Non
configs = _process_confs(
{
"fugue.spark.use_pandas_udf": True,
"spark.driver.memory": "14g",
"spark.sql.shuffle.partitions": "16",
"spark.driver.memory": _get_optimal_mem(),
"spark.sql.shuffle.partitions": _get_optimal_partition(),
"spark.sql.execution.arrow.pyspark.fallback.enabled": True,
"spark.driver.extraJavaOptions": "-Dio.netty.tryReflectionSetAccessible=true", # noqa: E501
"spark.executor.extraJavaOptions": "-Dio.netty.tryReflectionSetAccessible=true", # noqa: E501
Expand All @@ -105,7 +106,7 @@ def __init__(self, spark_session: Optional[SparkSession] = None, conf: Any = Non
class KaggleDaskExecutionEngine(DaskExecutionEngine):
def __init__(self, conf: Any = None):
configs = _process_confs(
{FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS: 16},
{FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS: _get_optimal_partition()},
ParamDict(conf),
)
super().__init__(conf=configs)
Expand Down Expand Up @@ -173,3 +174,12 @@ def _process_conf(conf: Dict[str, Any]) -> Iterable[Tuple[str, Any]]:
yield "fugue.rpc.server", "fugue.rpc.base.NativeRPCServer"
else:
yield k, v


def _get_optimal_mem(ratio: float = 0.8) -> str:
mem = psutil.virtual_memory()
return str(int(mem.total * ratio / 1024 / 1024)) + "m"


def _get_optimal_partition() -> int:
return psutil.cpu_count() * 4
2 changes: 1 addition & 1 deletion fuggle_version/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.3.1"
__version__ = "0.3.2"
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,15 @@
keywords="fugue kaggle sql spark dask pandas",
url="http://github.com/fugue-project/fuggle",
install_requires=[
"fugue[spark,dask,sql]==0.6.0",
"fugue[spark,dask,sql]==0.6.2",
"tune[all]==0.0.6",
"notebook",
"kaggle",
"seaborn",
"qpd",
"dask[dataframe]",
"pandavro",
"psutil",
],
extras_require={
"dasksql": ["dask-sql!=0.3.4"],
Expand Down

0 comments on commit 4aa3165

Please sign in to comment.