diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 15a357a18..6617ae66d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -157,6 +157,54 @@ jobs:
         source pygraphistry/bin/activate
         ./bin/test-umap-learn-core.sh
 
+  test-gpu-umap:  # CPU-only until we get a GitHub Actions GPU node
+
+    needs: [ test-minimal-python ]
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        python-version: [3.8, 3.9]
+
+    steps:
+
+    - name: Checkout repo
+      uses: actions/checkout@v3
+      with:
+        lfs: true
+
+    - name: Checkout LFS objects
+      run: git lfs pull
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install test dependencies
+      run: |
+        python -m venv pygraphistry
+        source pygraphistry/bin/activate
+        python -m pip install --upgrade pip
+        python -m pip install -e .[test,testai,cu_cat]
+
+    - name: Type check
+      run: |
+        source pygraphistry/bin/activate
+        ./bin/typecheck.sh
+
+    - name: Core feature tests (weak featurize)
+      run: |
+        source pygraphistry/bin/activate
+        ./bin/test-features.sh
+
+    - name: Core umap tests (weak featurize)
+      run: |
+        source pygraphistry/bin/activate
+        ./bin/test-umap-learn-core.sh
+
+
   test-full-ai:
 
     needs: [ test-minimal-python ]
diff --git a/docker/test-gpu-local.sh b/docker/test-gpu-local.sh
index 14d4c2779..8abd33003 100755
--- a/docker/test-gpu-local.sh
+++ b/docker/test-gpu-local.sh
@@ -47,5 +47,4 @@ docker run \
     ${NETWORK} \
     graphistry/test-gpu:${TEST_CPU_VERSION} \
     --maxfail=1 \
-    --ignore=graphistry/tests/test_feature_utils.py \
     $@
diff --git a/graphistry/constants.py b/graphistry/constants.py
index f6fda05fd..d74d9a81a 100644
--- a/graphistry/constants.py
+++ b/graphistry/constants.py
@@ -45,6 +45,7 @@
 # for preprocessors namespace
 # for dirty_cat params
 DIRTY_CAT = "dirty_cat"
+CUDA_CAT = "cu_cat"
 N_TOPICS_DEFAULT = 42
 N_TOPICS_TARGET_DEFAULT = 7
 N_HASHERS_DEFAULT = 100
diff --git a/graphistry/embed_utils.py b/graphistry/embed_utils.py
index 81fc45fe8..6050de056 100644
--- a/graphistry/embed_utils.py
+++ b/graphistry/embed_utils.py
@@ -2,7 +2,7 @@
 import numpy as np
 import pandas as pd
 from typing import Optional, Union, Callable, List, TYPE_CHECKING, Any, Tuple
-
+from inspect import getmodule
 from .PlotterBase import Plottable
 from .compute.ComputeMixin import ComputeMixin
 
@@ -21,13 +21,7 @@ def lazy_embed_import_dep():
     except:
         return False, None, None, None, None, None, None, None
 
-def check_cudf():
-    try:
-        import cudf
-        return True, cudf
-    except:
-        return False, object
-
+
 if TYPE_CHECKING:
     _, torch, _, _, _, _, _, _ = lazy_embed_import_dep()
 
@@ -38,8 +32,6 @@ def check_cudf():
     MIXIN_BASE = object
     torch = Any
 
-has_cudf, cudf = check_cudf()
-
 XSymbolic = Optional[Union[List[str], str, pd.DataFrame]]
 ProtoSymbolic = Optional[Union[str, Callable[[TT, TT, TT], TT]]]  # type: ignore
 
@@ -301,12 +293,12 @@ def embed(
         """
         # this is temporary, will be fixed in future releases
         try:
-            if isinstance(self._nodes, cudf.DataFrame):
+            if 'cudf' in str(getmodule(self._nodes)):
                 self._nodes = self._nodes.to_pandas()
         except:
             pass
         try:
-            if isinstance(self._edges, cudf.DataFrame):
+            if 'cudf' in str(getmodule(self._edges)):
                 self._edges = self._edges.to_pandas()
         except:
             pass
 
@@ -436,7 +428,7 @@ def predict_links(
         else:
             # this is temporary, will be removed after gpu feature utils
             try:
-                if isinstance(source, cudf.DataFrame):
+                if 'cudf' in str(getmodule(source)):
                     source = source.to_pandas()  # type: ignore
             except:
                 pass
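Note on the pattern above: swapping `isinstance(x, cudf.DataFrame)` for a check on `inspect.getmodule` lets the code detect cudf objects without importing cudf at module load. A minimal sketch of the trick; `is_cudf_frame` is an illustrative name, not part of this patch:

```python
# inspect.getmodule resolves the module an object's class was defined in,
# so the test works even when cudf is not installed.
from inspect import getmodule

import pandas as pd

def is_cudf_frame(obj) -> bool:
    # e.g. "<module 'cudf.core.dataframe' ...>" vs "<module 'pandas.core.frame' ...>"
    return 'cudf' in str(getmodule(obj))

assert not is_cudf_frame(pd.DataFrame({'a': [1]}))
```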
@@ -448,7 +440,7 @@ def predict_links(
         else:
             # this is temporary, will be removed after gpu feature utils
             try:
-                if isinstance(relation, cudf.DataFrame):
+                if 'cudf' in str(getmodule(relation)):
                     relation = relation.to_pandas()  # type: ignore
             except:
                 pass
 
@@ -460,7 +452,8 @@ def predict_links(
         else:
             # this is temporary, will be removed after gpu feature utils
             try:
-                if isinstance(destination, cudf.DataFrame):
+                # if isinstance(destination, cudf.DataFrame):
+                if 'cudf' in str(getmodule(destination)):
                     destination = destination.to_pandas()  # type: ignore
             except:
                 pass
diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py
index 26214f3a6..02c1e5af9 100644
--- a/graphistry/feature_utils.py
+++ b/graphistry/feature_utils.py
@@ -20,6 +20,7 @@
 from graphistry.compute.ComputeMixin import ComputeMixin
 from . import constants as config
+from .constants import CUDA_CAT, DIRTY_CAT
 from .PlotterBase import WeakValueDictionary, Plottable
 from .util import setup_logger, check_set_memoize
 from .ai_utils import infer_graph, infer_self_graph
@@ -41,12 +42,19 @@
     from dirty_cat import (
         SuperVectorizer,
         GapEncoder,
-        SimilarityEncoder,
     )
 except:
     SuperVectorizer = Any
     GapEncoder = Any
-    SimilarityEncoder = Any
+
+    try:
+        from cu_cat import (
+            SuperVectorizer,
+            GapEncoder,
+        )  # type: ignore
+    except:
+        SuperVectorizer = Any
+        GapEncoder = Any
 try:
     from sklearn.preprocessing import FunctionTransformer
     from sklearn.base import BaseEstimator, TransformerMixin
@@ -60,7 +68,6 @@
     SentenceTransformer = Any
     SuperVectorizer = Any
     GapEncoder = Any
-    SimilarityEncoder = Any
     FunctionTransformer = Any
     BaseEstimator = Any
     TransformerMixin = Any
@@ -89,15 +96,22 @@ def lazy_import_has_min_dependancy():
     except ModuleNotFoundError as e:
         return False, e
 
-def lazy_import_has_dirty_cat():
+def lazy_import_has_cudf_dependancy():
     import warnings
     warnings.filterwarnings("ignore")
     try:
-        import dirty_cat
-        return True, 'ok', dirty_cat
+        from cu_cat import __version__ as cu_cat_version
+        from cuml import __version__ as cuml_version
+        from cudf import __version__ as cudf_version
+        import cudf
+        logger.debug(f"Cuda CAT VERSION: {cu_cat_version}")
+        logger.debug(f"cuml VERSION: {cuml_version}")
+        logger.debug(f"cudf VERSION: {cudf_version}")
+        return True, 'ok', cudf
     except ModuleNotFoundError as e:
         return False, e, None
 
+
 def assert_imported_text():
     has_dependancy_text_, import_text_exn, _ = lazy_import_has_dependancy_text()
     if not has_dependancy_text_:
@@ -108,7 +122,7 @@ def assert_imported_text():
         raise import_text_exn
 
 
-def assert_imported():
+def assert_imported_min():
     has_min_dependancy_, import_min_exn = lazy_import_has_min_dependancy()
     if not has_min_dependancy_:
         logger.error(  # noqa
@@ -118,6 +132,35 @@ def assert_imported():
         raise import_min_exn
 
 
+def assert_imported_cucat():
+    has_dependancy_cudf_, import_exn, cudf = lazy_import_has_cudf_dependancy()
+    if not has_dependancy_cudf_:
+        logger.error(  # noqa
+            "cuml not found; try running "  # noqa
+            "`pip install --extra-index-url=https://pypi.nvidia.com cuml-cu12 cudf-cu12`"  # noqa
+        )
+        raise import_exn
+
+
+def make_safe_gpu_dataframes(X, y, engine):
+    has_dependancy_cudf_, _, cudf = lazy_import_has_cudf_dependancy()
+
+    if has_dependancy_cudf_:
+        assert cudf is not None
+        new_kwargs = {}
+        kwargs = {'X': X, 'y': y}
+        for key, value in kwargs.items():
+            if isinstance(value, cudf.DataFrame) and engine in ["pandas", "dirty_cat", "torch"]:
+                new_kwargs[key] = value.to_pandas()
+            elif isinstance(value, pd.DataFrame) and engine in ["cuml", "cu_cat", "cuda", "gpu"]:
+                new_kwargs[key] = cudf.from_pandas(value)
+            else:
+                new_kwargs[key] = value
+        return new_kwargs['X'], new_kwargs['y']
+    else:
+        return X, y
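A usage sketch for the `make_safe_gpu_dataframes` helper added above: CPU engines downcast cudf inputs, GPU engines upcast pandas inputs, and everything else passes through unchanged (the GPU path assumes a working RAPIDS install):

```python
import pandas as pd

X = pd.DataFrame({'feat': [1.0, 2.0]})
y = pd.DataFrame({'label': [0, 1]})

# CPU engine: pandas inputs pass through unchanged
X_cpu, y_cpu = make_safe_gpu_dataframes(X, y, engine="dirty_cat")
assert isinstance(X_cpu, pd.DataFrame)

# GPU engine: upcast to cudf, but only when the RAPIDS imports succeed;
# otherwise the helper silently returns the inputs as-is
X_gpu, y_gpu = make_safe_gpu_dataframes(X, y, engine="cu_cat")
```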
+
+
 # ############################################################################
 #
 #   Rough calltree
 #
 # _featurize_or_get_edges_dataframe_if_X_is_None
 
-FeatureEngineConcrete = Literal["none", "pandas", "dirty_cat", "torch"]
+FeatureEngineConcrete = Literal["none", "pandas", "dirty_cat", "torch", "cu_cat"]
 
 FeatureEngine = Literal[FeatureEngineConcrete, "auto"]
 
@@ -149,9 +192,8 @@ def resolve_feature_engine(
     feature_engine: FeatureEngine,
 ) -> FeatureEngineConcrete:  # noqa
 
-    if feature_engine in ["none", "pandas", "dirty_cat", "torch"]:
+    if feature_engine in ["none", "pandas", "dirty_cat", "torch", "cu_cat"]:
         return feature_engine  # type: ignore
-
     if feature_engine == "auto":
         has_dependancy_text_, _, _ = lazy_import_has_dependancy_text()
         if has_dependancy_text_:
             return "torch"
         has_min_dependancy_, _ = lazy_import_has_min_dependancy()
         if has_min_dependancy_:
             return "dirty_cat"
+        has_dependancy_cudf_, _, _ = lazy_import_has_cudf_dependancy()
+        if has_dependancy_cudf_:
+            return "cu_cat"
         return "pandas"
 
     raise ValueError(  # noqa
         f'feature_engine expected to be "none", '
-        '"pandas", "dirty_cat", "torch", "cu_cat", or "auto" '
+        '"pandas", "dirty_cat", "torch", "cu_cat", or "auto" '
         f'but received: {feature_engine} :: {type(feature_engine)}'
     )
 
@@ -173,7 +218,9 @@ def resolve_feature_engine(
 
 
 def resolve_y(df: Optional[pd.DataFrame], y: YSymbolic) -> pd.DataFrame:
 
-    if isinstance(y, pd.DataFrame) or 'cudf' in str(getmodule(y)):
+    _, _, cudf = lazy_import_has_cudf_dependancy()
+
+    if isinstance(y, pd.DataFrame) or (cudf is not None and 'cudf' in str(getmodule(y))):
         return y  # type: ignore
 
     if df is None:
@@ -194,7 +241,9 @@ def resolve_y(df: Optional[pd.DataFrame], y: YSymbolic) -> pd.DataFrame:
 
 
 def resolve_X(df: Optional[pd.DataFrame], X: XSymbolic) -> pd.DataFrame:
 
-    if isinstance(X, pd.DataFrame) or 'cudf' in str(getmodule(X)):
+    _, _, cudf = lazy_import_has_cudf_dependancy()
+
+    if isinstance(X, pd.DataFrame) or (cudf is not None and 'cudf' in str(getmodule(X))):
         return X  # type: ignore
 
     if df is None:
@@ -234,18 +283,19 @@ def features_without_target(
     :param y: target DataFrame
     :return: DataFrames of model and target
     """
+    _, _, cudf = lazy_import_has_cudf_dependancy()
     if y is None:
         return df
     remove_cols = []
     if y is None:
         pass
-    elif isinstance(y, pd.DataFrame):
+    elif isinstance(y, pd.DataFrame) or (cudf is not None and isinstance(y, cudf.DataFrame)):
         yc = y.columns
         xc = df.columns
         for c in yc:
             if c in xc:
                 remove_cols.append(c)
-    elif isinstance(y, pd.Series):
+    elif isinstance(y, pd.Series) or (cudf is not None and isinstance(y, cudf.Series)):
         if y.name and (y.name in df.columns):
             remove_cols = [y.name]
     elif isinstance(y, List):
@@ -264,12 +314,13 @@ def features_without_target(
 
 
 def remove_node_column_from_symbolic(X_symbolic, node):
+    _, _, cudf = lazy_import_has_cudf_dependancy()
    if isinstance(X_symbolic, list):
         if node in X_symbolic:
             logger.info(f"Removing `{node}` from input X_symbolic list")
             X_symbolic.remove(node)
         return X_symbolic
-    if isinstance(X_symbolic, pd.DataFrame):
+    if isinstance(X_symbolic, pd.DataFrame) or (cudf is not None and isinstance(X_symbolic, cudf.DataFrame)):
         logger.info(f"Removing `{node}` from input X_symbolic DataFrame")
         return X_symbolic.drop(columns=[node], errors="ignore")
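The `"auto"` cascade in `resolve_feature_engine` above resolves in a fixed priority order; a quick sketch of the observable behavior:

```python
# "auto" probes in priority order: sentence-transformers -> "torch",
# dirty_cat -> "dirty_cat", RAPIDS/cu_cat -> "cu_cat", else "pandas".
engine = resolve_feature_engine("auto")
assert engine in ("torch", "dirty_cat", "cu_cat", "pandas")

# Explicit names short-circuit the probing entirely
assert resolve_feature_engine("cu_cat") == "cu_cat"
```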
@@ -352,7 +403,19 @@ def set_to_numeric(df: pd.DataFrame, cols: List, fill_value: float = 0.0):
 
 def set_to_datetime(df: pd.DataFrame, cols: List, new_col: str):
     # eg df["Start_Date"] = pd.to_datetime(df[['Month', 'Day', 'Year']])
-    df[new_col] = pd.to_datetime(df[cols], errors="coerce").fillna(0)
+    X_type = str(getmodule(df))
+    if 'cudf' not in X_type:
+        df[new_col] = pd.to_datetime(df[cols], errors="coerce").fillna(0)
+    else:
+        _, _, cudf = lazy_import_has_cudf_dependancy()
+        assert cudf is not None
+        for col in df.columns:
+            try:
+                df[col] = cudf.to_datetime(
+                    df[col], errors="raise", infer_datetime_format=True
+                )
+            except:
+                pass
 
 
 def set_to_bool(df: pd.DataFrame, col: str, value: Any):
@@ -630,11 +693,20 @@ def fit_pipeline(
     columns = X.columns
     index = X.index
 
-    X = transformer.fit_transform(X)
-    if keep_n_decimals:
-        X = np.round(X, decimals=keep_n_decimals)  # type: ignore  # noqa
-
-    return pd.DataFrame(X, columns=columns, index=index)
+    X_type = str(getmodule(X))
+    if 'cudf' not in X_type:
+        X = transformer.fit_transform(X)
+        if keep_n_decimals:
+            X = np.round(X, decimals=keep_n_decimals)  # type: ignore  # noqa
+        X = pd.DataFrame(X, columns=columns, index=index)
+    else:
+        X = transformer.fit_transform(X)
+        if keep_n_decimals:
+            X = np.round(X, decimals=keep_n_decimals)  # type: ignore  # noqa
+        _, _, cudf = lazy_import_has_cudf_dependancy()
+        assert cudf is not None
+        X = cudf.DataFrame(X, columns=columns, index=index)
+    return X
 
 
 def impute_and_scale_df(
@@ -859,6 +931,7 @@ def process_dirty_dataframes(
     similarity: Optional[str] = None,  # "ngram",
     categories: Optional[str] = "auto",
     multilabel: bool = False,
+    feature_engine: Optional[str] = "dirty_cat",
 ) -> Tuple[
     pd.DataFrame,
     Optional[pd.DataFrame],
@@ -884,23 +957,36 @@ def process_dirty_dataframes(
     :return: Encoded data matrix and target (if not None),
         the data encoder, and the label encoder.
     """
-    has_dirty_cat, _, dirty_cat = lazy_import_has_dirty_cat()
-    if has_dirty_cat:
-        from dirty_cat import SuperVectorizer, GapEncoder, SimilarityEncoder
-        from sklearn.preprocessing import FunctionTransformer
+
+    if feature_engine == "cu_cat":
+        assert_imported_cucat()
+        from cu_cat import SuperVectorizer, GapEncoder
+        from cuml.preprocessing import FunctionTransformer
+
+    else:
+        from dirty_cat import SuperVectorizer, GapEncoder
+        from sklearn.preprocessing import FunctionTransformer
+
     t = time()
 
-    all_numeric = is_dataframe_all_numeric(ndf)
-    if not all_numeric and has_dirty_cat:
-        data_encoder = SuperVectorizer(
-            auto_cast=True,
-            cardinality_threshold=cardinality_threshold,
-            high_card_cat_transformer=GapEncoder(n_topics),
-            # numerical_transformer=StandardScaler(), This breaks
-            # since -- AttributeError: Transformer numeric
-            # (type StandardScaler)
-            # does not provide get_feature_names.
-        )
+    if not is_dataframe_all_numeric(ndf):
+        if feature_engine == "cu_cat":
+            data_encoder = SuperVectorizer(
+                auto_cast=True,
+                cardinality_threshold=cardinality_threshold,
+                high_card_cat_transformer=GapEncoder(n_topics),
+                datetime_transformer="passthrough"
+            )
+        else:
+            data_encoder = SuperVectorizer(
+                auto_cast=True,
+                cardinality_threshold=cardinality_threshold,
+                high_card_cat_transformer=GapEncoder(n_topics),
+                # numerical_transformer=StandardScaler(), This breaks
+                # since -- AttributeError: Transformer numeric
+                # (type StandardScaler)
+                # does not provide get_feature_names.
+            )
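For context on what the encoder configured above does on the CPU path, a small illustrative dirty_cat example (the data and thresholds are made up; only the parameters shown in the patch are used):

```python
import pandas as pd
from dirty_cat import SuperVectorizer, GapEncoder

df = pd.DataFrame({
    'city': ['NYC', 'NYC', 'LA', 'LA'],          # low cardinality -> one-hot
    'note': ['engine misfire', 'oil leak',
             'flat tire', 'cracked glass'],       # high cardinality -> GapEncoder topics
})

# Mirrors the construction above: columns with more unique values than
# cardinality_threshold are routed through the topic encoder
enc = SuperVectorizer(
    auto_cast=True,
    cardinality_threshold=3,
    high_card_cat_transformer=GapEncoder(2),
)
X = enc.fit_transform(df)  # numeric feature matrix
```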
 
         logger.info(":: Encoding DataFrame might take a few minutes ------")
 
         features_transformed = data_encoder.get_feature_names_out()
         all_transformers = data_encoder.transformers
-        logger.debug(f"-Shape of [[dirty_cat fit]] data {X_enc.shape}")
+        if feature_engine == "cu_cat":
+            logger.info(f"-Shape of [[cu_cat fit]] data {X_enc.shape}")
+        else:
+            logger.info(f"-Shape of [[dirty_cat fit]] data {X_enc.shape}")
         logger.debug(f"-Transformers: \n{all_transformers}\n")
         logger.debug(
             f"-Transformed Columns: \n{features_transformed[:20]}...\n"
         )
@@ -932,16 +1021,27 @@ def process_dirty_dataframes(
         )
         # now just set the feature names, since dirty cat changes them in
         # a weird way...
-        data_encoder.get_feature_names_out = callThrough(features_transformed)
-
-        X_enc = pd.DataFrame(
-            X_enc, columns=features_transformed, index=ndf.index
-        )
-        X_enc = X_enc.fillna(0.0)
-    elif all_numeric and not has_dirty_cat:
-        numeric_ndf = ndf.select_dtypes(include=[np.number])  # type: ignore
-        logger.warning("-*-*- DataFrame is not numeric and no dirty_cat, dropping non-numeric")
-        X_enc, _, data_encoder, _ = get_numeric_transformers(numeric_ndf, None)
+        data_encoder.get_feature_names_out = callThrough(features_transformed)
+        if 'cudf' not in str(getmodule(ndf)):
+            X_enc = pd.DataFrame(
+                X_enc, columns=features_transformed, index=ndf.index
+            )
+            X_enc = X_enc.fillna(0.0)
+        elif 'cudf' in str(getmodule(ndf)) and 'cudf' not in str(getmodule(X_enc)):
+            _, _, cudf = lazy_import_has_cudf_dependancy()
+            try:
+                X_enc = cudf.DataFrame(X_enc)
+            except TypeError:
+                X_enc = cudf.DataFrame(X_enc.toarray())  # if sparse cupy array
+            # ndf = set_to_datetime(ndf, 'A', 'A')
+            dt_count = ndf.select_dtypes(include=["datetime", "datetimetz"]).columns.to_list()
+            if len(dt_count) > 0:
+                dt_new = ['datetime_' + str(n) for n in range(len(dt_count))]
+                features_transformed.extend(dt_new)
+            X_enc.columns = features_transformed
+            X_enc = X_enc.set_index(ndf.index)
+            X_enc = X_enc.fillna(0.0)
+
     else:
         logger.debug("-*-*- DataFrame is completely numeric")
         X_enc, _, data_encoder, _ = get_numeric_transformers(ndf, None)
@@ -958,15 +1058,23 @@ def process_dirty_dataframes(
         t2 = time()
         logger.debug("-Fitting Targets --\n%s", y.columns)
 
-        label_encoder = SuperVectorizer(
-            auto_cast=True,
-            cardinality_threshold=cardinality_threshold_target,
-            high_card_cat_transformer=GapEncoder(n_topics_target)
-            if not similarity
-            else SimilarityEncoder(
-                similarity=similarity, categories=categories, n_prototypes=2
-            ),  # Similarity
-        )
+        if feature_engine == "cu_cat":
+            label_encoder = SuperVectorizer(
+                auto_cast=True,
+                cardinality_threshold=cardinality_threshold_target,
+                high_card_cat_transformer=GapEncoder(n_topics_target),
+                datetime_transformer="passthrough"
+            )
+        else:
+            label_encoder = SuperVectorizer(
+                auto_cast=True,
+                cardinality_threshold=cardinality_threshold_target,
+                high_card_cat_transformer=GapEncoder(n_topics_target)
+                # if not similarity
+                # else SimilarityEncoder(
+                #     similarity=similarity, categories=categories, n_prototypes=2
+                # ),  # Similarity
+            )
 
         y_enc = label_encoder.fit_transform(y)
         y_enc = make_array(y_enc)
@@ -1152,7 +1260,8 @@ def process_nodes_dataframes(
         n_topics_target=n_topics_target,
         similarity=similarity,
         categories=categories,
-        multilabel=multilabel
+        multilabel=multilabel,
+        feature_engine=feature_engine,
     )
 
     if embedding:
@@ -1270,20 +1379,31 @@ def encode_edges(edf, src, dst, mlb, fit=False):
     """
     # uses mlb with fit=T/F so we can use it in transform mode
     # to recreate edge feature concat definition
+    edf_type = str(getmodule(edf))
     source = edf[src]
     destination = edf[dst]
+    source_dtype = str(getmodule(source))
     logger.debug("Encoding Edges using MultiLabelBinarizer")
-    if fit:
+    if fit and 'cudf' not in source_dtype:
         T = mlb.fit_transform(zip(source, destination))
-    else:
+    elif fit and 'cudf' in source_dtype:
+        T = mlb.fit_transform(zip(source.to_pandas(), destination.to_pandas()))
+    elif not fit and 'cudf' not in source_dtype:
         T = mlb.transform(zip(source, destination))
+    elif not fit and 'cudf' in source_dtype:
+        T = mlb.transform(zip(source.to_pandas(), destination.to_pandas()))
+
     T = 1.0 * T  # coerce to float
     columns = [
         str(k) for k in mlb.classes_
     ]  # stringify the column names or scikits.base throws error
     mlb.get_feature_names_out = callThrough(columns)
     mlb.columns_ = [src, dst]
-    T = pd.DataFrame(T, columns=columns, index=edf.index)
+    if 'cudf' in edf_type:
+        _, _, cudf = lazy_import_has_cudf_dependancy()
+        T = cudf.DataFrame(T, columns=columns, index=edf.index)
+    else:
+        T = pd.DataFrame(T, columns=columns, index=edf.index)
     logger.info(f"Shape of Edge Encoding: {T.shape}")
     return T, mlb
 
@@ -1356,6 +1476,7 @@ def process_edge_dataframes(
         MultiLabelBinarizer()
     )  # create new one so we can use encode_edges later in
     # transform with fit=False
+    _, _, cudf = lazy_import_has_cudf_dependancy()
     T, mlb_pairwise_edge_encoder = encode_edges(
         edf, src, dst, mlb_pairwise_edge_encoder, fit=True
     )
@@ -1368,13 +1489,23 @@ def process_edge_dataframes(
             " and is empty"
         )
 
-    if feature_engine in ["none", "pandas"]:
+    if feature_engine in ["none", "pandas", "cudf"]:
         X_enc, y_enc, data_encoder, label_encoder = get_numeric_transformers(
             other_df, y
         )
         # add the two datasets together
-        X_enc = pd.concat([T, X_enc], axis=1)
+        has_dependancy_cudf_, import_exn, cudf = lazy_import_has_cudf_dependancy()
+        T_type = str(getmodule(T))
+        X_type = str(getmodule(X_enc))
+        if 'cudf' in T_type and 'cudf' in X_type:
+            X_enc = cudf.concat([T, X_enc], axis=1)
+        elif 'pandas' in T_type and 'pandas' in X_type:
+            X_enc = pd.concat([T, X_enc], axis=1)
+        elif 'cudf' in T_type and 'pandas' in X_type:
+            X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1)
+        elif 'pandas' in T_type and 'cudf' in X_type:
+            X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1)
         # then scale them
         X_encs, y_encs, scaling_pipeline, scaling_pipeline_target = smart_scaler(  # noqa
             X_enc,
@@ -1441,7 +1572,17 @@ def process_edge_dataframes(
     if not X_enc.empty and not T.empty:
         logger.debug("-" * 60)
         logger.debug("<= Found Edges and Dirty_cat encoding =>")
-        X_enc = pd.concat([T, X_enc], axis=1)
+        has_dependancy_cudf_, import_exn, cudf = lazy_import_has_cudf_dependancy()
+        T_type = str(getmodule(T))
+        X_type = str(getmodule(X_enc))
+        if 'cudf' in T_type and 'cudf' in X_type:
+            X_enc = cudf.concat([T, X_enc], axis=1)
+        elif 'pandas' in T_type and 'pandas' in X_type:
+            X_enc = pd.concat([T, X_enc], axis=1)
+        elif 'cudf' in T_type and 'pandas' in X_type:
+            X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1)
+        elif 'pandas' in T_type and 'cudf' in X_type:
+            X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1)
     elif not T.empty and X_enc.empty:
         logger.debug("-" * 60)
         logger.debug("<= Found only Edges =>")
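The same four-way concat branching now appears in several spots (here and in `transform` below). A hypothetical consolidation, not part of this PR, shows the invariant being maintained — promote to GPU when either side is already cudf, otherwise stay in pandas:

```python
from inspect import getmodule
import pandas as pd

def safe_concat(a, b, axis=1):
    # Hypothetical helper, not in the patch: unify the repeated branching
    a_gpu = 'cudf' in str(getmodule(a))
    b_gpu = 'cudf' in str(getmodule(b))
    if a_gpu or b_gpu:
        import cudf  # only reached when a cudf frame already exists
        a = a if a_gpu else cudf.from_pandas(a)
        b = b if b_gpu else cudf.from_pandas(b)
        return cudf.concat([a, b], axis=axis)
    return pd.concat([a, b], axis=axis)
```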
@@ -1621,7 +1762,18 @@ def transform(
     # concat text to dirty_cat, with text in front.
     if not tX.empty and not X.empty:
-        X = pd.concat([tX, X], axis=1)
+        has_dependancy_cudf_, import_exn, cudf = lazy_import_has_cudf_dependancy()
+        T_type = str(getmodule(tX))
+        X_type = str(getmodule(X))
+        if 'cudf' in T_type and 'cudf' in X_type:
+            X = cudf.concat([tX, X], axis=1)
+        elif 'pandas' in T_type and 'pandas' in X_type:
+            X = pd.concat([tX, X], axis=1)
+        elif 'cudf' in T_type and 'pandas' in X_type:
+            X = cudf.concat([tX, cudf.from_pandas(X)], axis=1)
+        elif 'pandas' in T_type and 'cudf' in X_type:
+            X = cudf.concat([cudf.from_pandas(tX), X], axis=1)
         logger.info("--Combining both Textual and Numeric/Dirty_Cat")
     elif not tX.empty and X.empty:
         X = tX  # textual
@@ -1636,7 +1788,18 @@ def transform(
     # now if edges, add T at front
     if kind == "edges":
-        X = pd.concat([T, X], axis=1)  # edges, text, dirty_cat
+        has_dependancy_cudf_, import_exn, cudf = lazy_import_has_cudf_dependancy()
+        T_type = str(getmodule(T))
+        X_type = str(getmodule(X))
+        if 'cudf' in T_type and 'cudf' in X_type:
+            X = cudf.concat([T, X], axis=1)  # edges, text, dirty_cat
+        elif 'pandas' in T_type and 'pandas' in X_type:
+            X = pd.concat([T, X], axis=1)
+        elif 'cudf' in T_type and 'pandas' in X_type:
+            X = cudf.concat([T, cudf.from_pandas(X)], axis=1)
+        elif 'pandas' in T_type and 'cudf' in X_type:
+            X = cudf.concat([cudf.from_pandas(T), X], axis=1)
     logger.info("-Combining MultiLabelBinarizer with previous features")
     logger.info("-" * 40)
 
@@ -1846,7 +2009,7 @@ def prune_weighted_edges_df_and_relabel_nodes(
         " -- Pruning weighted edge DataFrame "
         f"from {len(wdf):,} to {len(wdf2):,} edges."
     )
-    if index_to_nodes_dict is not None:
+    if index_to_nodes_dict is not None and isinstance(index_to_nodes_dict, dict):
         wdf2[config.SRC] = wdf2[config.SRC].map(index_to_nodes_dict)
         wdf2[config.DST] = wdf2[config.DST].map(index_to_nodes_dict)
     return wdf2
@@ -1963,7 +2126,7 @@ def _featurize_nodes(
         res = self.copy()
         ndf = res._nodes
         node = res._node
-
+
         if remove_node_column:
             ndf = remove_node_column_from_symbolic(ndf, node)
             X = remove_node_column_from_symbolic(X, node)
@@ -1987,7 +2150,7 @@ def _featurize_nodes(
         X_resolved = resolve_X(ndf, X)
         y_resolved = resolve_y(ndf, y)
 
-        feature_engine = resolve_feature_engine(feature_engine)
+        X_resolved, y_resolved = make_safe_gpu_dataframes(X_resolved, y_resolved, engine=feature_engine)
 
         from .features import ModelDict
 
@@ -2111,6 +2274,7 @@ def _featurize_edges(
             X_resolved = X_resolved.assign(
                 **{res._destination: res._edges[res._destination]}
             )
+        X_resolved, y_resolved = make_safe_gpu_dataframes(X_resolved, y_resolved, engine=feature_engine)
 
         # now that everything is set
         fkwargs = dict(
@@ -2416,6 +2580,7 @@ def featurize(
         remove_node_column: bool = True,
         inplace: bool = False,
         feature_engine: FeatureEngine = "auto",
+        engine: FeatureEngine = "auto",
         dbscan: bool = False,
         min_dist: float = 0.5,  # DBSCAN eps
         min_samples: int = 1,  # DBSCAN min_samples
@@ -2523,13 +2688,19 @@ def featurize(
             default True.
         :return: graphistry instance with new attributes set by the featurization process.
""" - assert_imported() + feature_engine = resolve_feature_engine(feature_engine) + + + if feature_engine == "cu_cat": + assert_imported_cucat() + else: + assert_imported_min() + if inplace: res = self else: res = self.bind() - feature_engine = resolve_feature_engine(feature_engine) if kind == "nodes": res = res._featurize_nodes( diff --git a/graphistry/tests/test_embed_utils.py b/graphistry/tests/test_embed_utils.py index 307bdd026..4f642c385 100644 --- a/graphistry/tests/test_embed_utils.py +++ b/graphistry/tests/test_embed_utils.py @@ -5,13 +5,15 @@ import graphistry import numpy as np -from graphistry.embed_utils import lazy_embed_import_dep, check_cudf - +from graphistry.embed_utils import lazy_embed_import_dep # , check_cudf +from graphistry.umap_utils import lazy_cudf_import_has_dependancy import logging logger = logging.getLogger(__name__) dep_flag, _, _, _, _, _, _, _ = lazy_embed_import_dep() -has_cudf, cudf = check_cudf() +# has_cudf, cudf = check_cudf() + +has_cudf, _, cudf = lazy_cudf_import_has_dependancy() # enable tests if has cudf and env didn't explicitly disable is_test_cudf = has_cudf and os.environ["TEST_CUDF"] != "0" diff --git a/graphistry/tests/test_feature_utils.py b/graphistry/tests/test_feature_utils.py index fa4333737..49b5181c9 100644 --- a/graphistry/tests/test_feature_utils.py +++ b/graphistry/tests/test_feature_utils.py @@ -1,4 +1,5 @@ # python -m unittest +import os import datetime as dt import graphistry import logging @@ -16,6 +17,8 @@ resolve_feature_engine, lazy_import_has_min_dependancy, lazy_import_has_dependancy_text, + lazy_import_has_cudf_dependancy, + set_to_datetime, FastEncoder ) @@ -26,6 +29,10 @@ has_min_dependancy, _ = lazy_import_has_min_dependancy() has_min_dependancy_text, _, _ = lazy_import_has_dependancy_text() +has_cudf, _, _ = lazy_import_has_cudf_dependancy() + +# enable tests if has cudf and env didn't explicitly disable +is_test_cudf = has_cudf and os.environ["TEST_CUDF"] != "0" logger = logging.getLogger(__name__) warnings.filterwarnings("ignore") @@ -33,7 +40,7 @@ model_avg_name = ( "/models/average_word_embeddings_komninos" # 250mb, fastest vectorizer in transformer models - #"/models/paraphrase-albert-small-v2" # 40mb + # "/models/paraphrase-albert-small-v2" # 40mb #"/models/paraphrase-MiniLM-L3-v2" # 60mb ) @@ -179,12 +186,12 @@ class TestFeaturizeGetMethods(unittest.TestCase): @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies") def setUp(self) -> None: g = graphistry.nodes(ndf_reddit) - g2 = g.featurize(y=double_target_reddit, # ngrams + g2 = g.featurize(y=double_target_reddit, feature_engine=resolve_feature_engine('auto'), # ngrams use_ngrams=True, ngram_range=(1, 4) ) - g3 = g.featurize(**topic_model # topic model + g3 = g.featurize(**topic_model, feature_engine=resolve_feature_engine('auto') # topic model ) self.g = g self.g2 = g2 @@ -269,12 +276,12 @@ def cases_tests(self, x, y, data_encoder, target_encoder, name, value): ) self.assertIsInstance( data_encoder, - dirty_cat.super_vectorizer.SuperVectorizer, + dirty_cat.table_vectorizer.TableVectorizer, f"Data Encoder is not a dirty_cat.super_vectorizer.SuperVectorizer instance for {name} {value}", ) self.assertIsInstance( target_encoder, - dirty_cat.super_vectorizer.SuperVectorizer, + dirty_cat.table_vectorizer.TableVectorizer, f"Data Target Encoder is not a dirty_cat.super_vectorizer.SuperVectorizer instance for {name} {value}", ) @@ -306,7 +313,7 @@ def test_multi_label_binarizer(self): g = 
@@ -306,7 +313,7 @@ def test_multi_label_binarizer(self):
         g = graphistry.nodes(bad_df)  # can take in a list of lists and convert to multiOutput
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=UserWarning)
-            g2 = g.featurize(y=['list_str'], X=['src'], multilabel=True)
+            g2 = g.featurize(y=['list_str'], X=['src'], multilabel=True, feature_engine=resolve_feature_engine('auto'))
         y = g2._get_target('node')
         assert y.shape == (4, 4)
         assert sum(y.sum(1).values - np.array([1., 2., 1., 0.])) == 0
@@ -348,7 +355,6 @@ def cases_test_graph(self, g, name, value, kind="nodes", df=ndf_reddit):
         else:
             ndf = g._edges
             self.cases_check_edge_attributes(g)
-
         cols = ndf.columns
         self.assertTrue(
             np.all(ndf.fillna(0) == df[cols].fillna(0)),
@@ -379,6 +385,7 @@ def _test_featurizations(self, g, use_cols, targets, name, kind, df):
                 use_scaler=None,
                 use_scaler_target=None,
                 use_ngrams=use_ngram,
+                feature_engine=resolve_feature_engine('auto'),
                 min_df=0.0,
                 max_df=1.0,
                 cardinality_threshold=cardinality,
@@ -420,7 +427,7 @@ def test_edge_featurization(self):
     @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
     def test_node_scaling(self):
         g = graphistry.nodes(ndf_reddit)
-        g2 = g.featurize(X="title", y='label', use_scaler=None, use_scaler_target=None)
+        g2 = g.featurize(X="title", y='label', use_scaler=None, use_scaler_target=None, feature_engine=resolve_feature_engine('auto'))
         for scaler in SCALERS:
             X, y, c, d = g2.scale(ndf_reddit, single_target_reddit, kind='nodes',
                                   use_scaler=scaler,
@@ -430,7 +437,7 @@ def test_node_scaling(self):
     @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
     def test_edge_scaling(self):
         g = graphistry.edges(edge_df2, "src", "dst")
-        g2 = g.featurize(y='label', kind='edges', use_scaler=None, use_scaler_target=None)
+        g2 = g.featurize(y='label', kind='edges', use_scaler=None, use_scaler_target=None, feature_engine=resolve_feature_engine('auto'))
         for scaler in SCALERS:
             X, y, c, d = g2.scale(edge_df2, edge2_target_df, kind='edges',
                                   use_scaler=scaler,
@@ -438,6 +445,5 @@ def test_edge_scaling(self):
                                   return_scalers=True)
 
 
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/graphistry/umap_utils.py b/graphistry/umap_utils.py
index d2561739d..cb0bdfaf3 100644
--- a/graphistry/umap_utils.py
+++ b/graphistry/umap_utils.py
@@ -128,7 +128,7 @@ def safe_cudf(X, y):
         for key, value in kwargs.items():
             if isinstance(value, cudf.DataFrame) and engine in ["pandas", "umap_learn", "dirty_cat"]:
                 new_kwargs[key] = value.to_pandas()
-            elif isinstance(value, pd.DataFrame) and engine in ["cuml", "cu_cat"]:
+            elif isinstance(value, pd.DataFrame) and engine in ["cuml", "cu_cat", "cuda"]:
                 new_kwargs[key] = cudf.from_pandas(value)
             else:
                 new_kwargs[key] = value
@@ -352,9 +352,9 @@ def transform_umap(self, df: pd.DataFrame,
 
     def _bundle_embedding(self, emb, index):
         # Converts Embedding into dataframe and takes care if emb.dim > 2
-        if emb.shape[1] == 2 and 'cudf.core.dataframe' not in str(getmodule(emb)) and not hasattr(emb, 'device'):
+        if emb.shape[1] == 2 and 'cudf' not in str(getmodule(emb)) and not hasattr(emb, 'device'):
             emb = pd.DataFrame(emb, columns=[config.X, config.Y], index=index)
-        elif emb.shape[1] == 2 and 'cudf.core.dataframe' in str(getmodule(emb)):
+        elif emb.shape[1] == 2 and 'cudf' in str(getmodule(emb)):
             emb.rename(columns={0: config.X, 1: config.Y}, inplace=True)
         elif emb.shape[1] == 2 and hasattr(emb, 'device'):
             import cudf
@@ -363,9 +363,9 @@ def _bundle_embedding(self, emb, index):
             columns = [config.X, config.Y] + [
                 f"umap_{k}" for k in range(2, emb.shape[1])
             ]
-        if 'cudf.core.dataframe' not in str(getmodule(emb)):
+        if 'cudf' not in str(getmodule(emb)):
             emb = pd.DataFrame(emb, columns=columns, index=index)
-        elif 'cudf.core.dataframe' in str(getmodule(emb)):
+        elif 'cudf' in str(getmodule(emb)):
             emb.columns = columns
         return emb
 
@@ -412,8 +412,15 @@ def _process_umap(
             print('** Fitting UMAP') if verbose else None
             res = res.umap_lazy_init(res, verbose=verbose, **umap_kwargs_pure)
 
+        self.datetime_columns = X_.select_dtypes(
+            include=["datetime", "datetimetz"]
+        ).columns.to_list()
+
+        self.R_ = X_[self.datetime_columns]
+        X_ = X_.drop(columns=self.datetime_columns)
+
         emb = res._umap_fit_transform(X_, y_, verbose=verbose)
-        res._xy = emb
+        res._xy = emb.join(self.R_)
         return res
 
@@ -614,7 +621,7 @@ def umap(
         logger.debug("data is type :: %s", (type(X_)))
         if isinstance(X_, pd.DataFrame):
             index_to_nodes_dict = dict(zip(range(len(nodes)), nodes))
-        elif 'cudf.core.dataframe' in str(getmodule(X_)):
+        elif 'cudf' in str(getmodule(X_)):
             index_to_nodes_dict = nodes  # {}?
 
         # add the safe coercion here
@@ -721,12 +728,19 @@ def _bind_xy_from_umap(
         emb = res._edge_embedding
 
     if isinstance(df, type(emb)):
-        df[x_name] = emb.values.T[0]
-        df[y_name] = emb.values.T[1]
-    elif isinstance(df, pd.DataFrame) and 'cudf.core.dataframe' in str(getmodule(emb)):
-        df[x_name] = emb.to_numpy().T[0]
-        df[y_name] = emb.to_numpy().T[1]
-
+        try:
+            df[x_name] = emb.values.T[0]
+            df[y_name] = emb.values.T[1]
+        except ValueError:
+            df[x_name] = emb.values[0]
+            df[y_name] = emb.values[1]
+    elif isinstance(df, pd.DataFrame) and 'cudf' in str(getmodule(emb)):
+        try:
+            df[x_name] = emb.to_numpy().T[0]
+            df[y_name] = emb.to_numpy().T[1]
+        except ValueError:
+            df[x_name] = emb.to_numpy()[0]
+            df[y_name] = emb.to_numpy()[1]
 
     res = res.nodes(df) if kind == "nodes" else res.edges(df)
 
     if encode_weight and kind == "nodes":
diff --git a/mypy.ini b/mypy.ini
index 898e00114..5b4403e91 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -94,3 +94,6 @@ ignore_missing_imports = True
 
 [mypy-cuml.*]
 ignore_missing_imports = True
+
+[mypy-cu_cat.*]
+ignore_missing_imports = True
diff --git a/setup.py b/setup.py
index c81db1b09..bfe0f5a21 100755
--- a/setup.py
+++ b/setup.py
@@ -44,10 +44,11 @@ def unique_flatten_dict(d):
 }
 
 base_extras_heavy = {
-    'umap-learn': ['umap-learn', 'dirty-cat==0.2.0', 'scikit-learn>=1.0'],
+    'umap-learn': ['umap-learn', 'dirty-cat', 'scikit-learn>=1.0'],
 }
 # https://github.com/facebookresearch/faiss/issues/1589 for faiss-cpu 1.6.1, #'setuptools==67.4.0' removed
 base_extras_heavy['ai'] = base_extras_heavy['umap-learn'] + ['scipy', 'dgl', 'torch<2', 'sentence-transformers', 'faiss-cpu', 'joblib']
 
+base_extras_heavy['cu_cat'] = ['cu_cat']  # cu_cat >=0.7.32 requires 'cuml>=23.02', 'cudf>=23.03', 'cupy>=11.0'; setup now assumes a GPU, whereas earlier versions' tests fell back to cu_cat on CPU
+
 base_extras = {**base_extras_light, **base_extras_heavy}
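Taken together, an end-to-end usage sketch of what this PR enables, assuming a RAPIDS GPU environment with the new extra installed (`pip install graphistry[cu_cat]`); the data is illustrative:

```python
import cudf
import graphistry

edf = cudf.DataFrame({
    'src': ['a', 'b', 'c'],
    'dst': ['b', 'c', 'a'],
    'msg': ['hello world', 'gpu accelerated', 'feature encoding'],
})

g = graphistry.edges(edf, 'src', 'dst')

# feature_engine="cu_cat" routes encoding through cu_cat/cuml; per the
# resolver changes above, "auto" also falls back to cu_cat when dirty_cat
# is absent but RAPIDS is importable
g2 = g.featurize(kind='edges', feature_engine='cu_cat')
```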