From 18fa40be09d0543c3849ef8e9f5b6a6444738539 Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Sun, 14 Apr 2024 00:54:07 +0200
Subject: [PATCH 01/12] Implement FAISS GPU support

---
 scHPL/faissKNeighbors.py |  6 +++++-
 scHPL/learn.py           | 26 +++++++++++++++++---------
 scHPL/train.py           | 18 +++++++++++-------
 scHPL/utils.py           |  2 +-
 4 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/scHPL/faissKNeighbors.py b/scHPL/faissKNeighbors.py
index 6273abe..a73c3a5 100644
--- a/scHPL/faissKNeighbors.py
+++ b/scHPL/faissKNeighbors.py
@@ -9,13 +9,17 @@
 import numpy as np
 
 class FaissKNeighbors:
-    def __init__(self, k=50):
+    def __init__(self, k=50, gpu=None):
         self.index = None
         self.y = None
         self.k = k
+        self.gpu = gpu
 
     def fit(self, X, y):
         self.index = faiss.IndexFlatL2(X.shape[1])
+        if self.gpu is not None:
+            res = faiss.StandardGpuResources()
+            self.index = faiss.index_cpu_to_gpu(res, self.gpu, self.index)
         self.index.add(X.astype(np.float32))
         self.y = y
 
diff --git a/scHPL/learn.py b/scHPL/learn.py
index 1bef064..0ddf6d6 100644
--- a/scHPL/learn.py
+++ b/scHPL/learn.py
@@ -9,7 +9,7 @@
 from anndata import AnnData
 
 from .train import train_tree
-from .utils import TreeNode, create_tree, print_tree
+from .utils import TreeNode, create_tree, print_tree as print_tree_func
 from .predict import predict_labels
 from .update import update_tree
 # from train import train_tree
@@ -36,11 +36,13 @@ def learn_tree(data: AnnData,
                distkNN: int = 99,
                dimred: bool = False,
                useRE: bool = True,
-               FN: float = 0.5,               
+               FN: float = 0.5,
                rej_threshold: float = 0.5,
                match_threshold: float = 0.25,
                attach_missing: bool = False,
-               print_conf: bool = False
+               print_conf: bool = False,
+               print_tree: bool = True,
+               gpu: int | None = None
 ):
     
     '''Learn a classification tree based on multiple labeled datasets.
@@ -93,6 +95,10 @@ def learn_tree(data: AnnData,
             If 'True' missing nodes are attached to the root node.
         print_conf: Boolean = False
             Whether to print the confusion matrices during the matching step.
+        print_tree: Boolean = True
+            Whether to print the tree during the training.
+        gpu: int = None
+            GPU index to use for the Faiss library (only used when classifier='knn')
             
         Returns
         -------
@@ -120,8 +126,9 @@ def learn_tree(data: AnnData,
     labels_1 = labels[idx_1]
     data_1 = xx[idx_1]
     
-    print('Starting tree:')
-    print_tree(tree)
+    if print_tree:
+        print('Starting tree:')
+        print_tree_func(tree)
     
     for b in batch_order:
         
@@ -137,13 +144,13 @@ def learn_tree(data: AnnData,
         if retrain:
             tree = train_tree(data_1, labels_1, tree, classifier, 
                               dimred, useRE, FN, n_neighbors, dynamic_neighbors,
-                              distkNN)
+                              distkNN, gpu=gpu)
         else:
             retrain = True 
         
         tree_2 = train_tree(data_2, labels_2, tree_2, classifier, 
                             dimred, useRE, FN, n_neighbors, dynamic_neighbors,
-                            distkNN)
+                            distkNN, gpu=gpu)
         
         # Predict labels other dataset
         labels_2_pred,_ = predict_labels(data_2, tree, threshold=rej_threshold)
@@ -160,8 +167,9 @@ def learn_tree(data: AnnData,
         
         missing_pop.extend(mis_pop)
         
-        print('\nUpdated tree:')
-        print_tree(tree, np.unique(labels_2))
+        if print_tree:
+            print('\nUpdated tree:')
+            print_tree_func(tree, np.unique(labels_2))
         
         #concatenate the two datasets
         data_1 = np.concatenate((data_1, data_2), axis = 0)
diff --git a/scHPL/train.py b/scHPL/train.py
index 40ef7ff..a43970a 100644
--- a/scHPL/train.py
+++ b/scHPL/train.py
@@ -34,7 +34,8 @@ def train_tree(data,
                FN: float = 0.5, 
                n_neighbors: int = 50,
                dynamic_neighbors: bool = True,
-               distkNN: int = 99):
+               distkNN: int = 99,
+               gpu=None):
     '''Train a hierarchical classifier. 
     
         Parameters
@@ -66,6 +67,8 @@ def train_tree(data,
             cell and it's closest neighbor of the training set. Threshold is 
             set to the distkNN's percentile of distances within the training
             set
+        gpu: int | None = None
+            GPU index to use for the Faiss library (only used when classifier='knn')
 
         
         Returns
@@ -129,7 +132,7 @@ def train_tree(data,
         except:
             None
         _,_ = _train_parentnode(data, labels_train, tree[0], n_neighbors, 
-                                dynamic_neighbors, distkNN)
+                                dynamic_neighbors, distkNN, gpu=gpu)
     else:
         for n in tree[0].descendants:
             _ = _train_node(data, labels, n, classifier, dimred, numgenes)
@@ -175,7 +178,7 @@ def _train_node(data, labels, n, classifier, dimred, numgenes):
         
     return group
 
-def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN):
+def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN, gpu=None):
     '''Train a knn classifier. In contrast to the linear svm and oc svm, this 
         is trained for each parent node instead of each child node
         
@@ -187,6 +190,7 @@ def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN):
         classifier: which classifier to use
         dimred: dimensionality reduction
         numgenes: number of genes in the training data
+        gpu: GPU index to use for the Faiss library (only used when classifier='knn')
         
         Return
         ------
@@ -203,7 +207,7 @@ def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN):
         for j in n.descendants:
             group_new, labels_new = _train_parentnode(data, labels, j, 
                                                       n_neighbors, dynamic_neighbors,
-                                                      distkNN)
+                                                      distkNN, gpu=gpu)
             group[np.where(group_new == 1)[0]] = 1
             labels[np.where(group_new == 1)[0]] = labels_new[np.where(group_new == 1)[0]]
         if n.name != None:
@@ -211,7 +215,7 @@ def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN):
             if len(n.descendants) == 1:
                 group[np.squeeze(np.isin(labels, n.name))] = 1
             # train_knn 
-            _train_knn(data,labels,group,n,n_neighbors,dynamic_neighbors,distkNN)
+            _train_knn(data,labels,group,n,n_neighbors,dynamic_neighbors,distkNN,gpu=gpu)
             # rename all group == 1 to node.name
             group[np.squeeze(np.isin(labels, n.name))] = 1
             labels[group==1] = n.name[0]
@@ -271,7 +275,7 @@ def _train_svm(data, labels, group, n):
     n.set_classifier(clf) #save classifier to the node
     
 
-def _train_knn(data, labels, group, n, n_neighbors, dynamic_neighbors, distkNN):
+def _train_knn(data, labels, group, n, n_neighbors, dynamic_neighbors, distkNN, gpu=None):
     '''Train a linear svm and attach to the node
     
         Parameters:
@@ -300,7 +304,7 @@ def _train_knn(data, labels, group, n, n_neighbors, dynamic_neighbors, distkNN):
     try:
         import faiss
         from .faissKNeighbors import FaissKNeighbors 
-        clf = FaissKNeighbors(k=k)
+        clf = FaissKNeighbors(k=k, gpu=gpu)
         clf.fit(data_knn, labels_knn)
         #print('Using FAISS library')
 
diff --git a/scHPL/utils.py b/scHPL/utils.py
index eecd3b0..cf9f3d1 100644
--- a/scHPL/utils.py
+++ b/scHPL/utils.py
@@ -53,7 +53,7 @@ def set_classifier(self, classifier):
         """
         Add a classifier to the node.
         """
-        self.classifier = copy.deepcopy(classifier)
+        self.classifier = classifier
     
     def get_classifier(self):
         return self.classifier

From a59e895fee5c80da04a56a0bf176e90d402f6885 Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Sun, 14 Apr 2024 11:02:14 +0200
Subject: [PATCH 02/12] Version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 7205552..40895a3 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
 
 setuptools.setup(
     name="scHPL", 
-    version="1.0.3",
+    version="1.0.4",
     author="Lieke Michielsen",
     author_email="l.c.m.michielsen@tudelft.nl",
     description="Hierarchical progressive learning pipeline for single-cell RNA-sequencing datasets",

From 1dc16b7288ba9503ff088f903594fb3e65845048 Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Sun, 14 Apr 2024 11:18:01 +0200
Subject: [PATCH 03/12] Fix gpu parameter typing bug

---
 scHPL/learn.py | 6 +++---
 scHPL/train.py | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/scHPL/learn.py b/scHPL/learn.py
index 0ddf6d6..cb9042b 100644
--- a/scHPL/learn.py
+++ b/scHPL/learn.py
@@ -18,9 +18,9 @@
 # from update import update_tree
 
 try:
-    from typing import Literal
+    from typing import Literal, Optional
 except ImportError:
-    from typing_extensions import Literal
+    from typing_extensions import Literal, Optional
 
 
 def learn_tree(data: AnnData,
@@ -42,7 +42,7 @@ def learn_tree(data: AnnData,
                attach_missing: bool = False,
                print_conf: bool = False,
                print_tree: bool = True,
-               gpu: int | None = None
+               gpu: Optional[int] = None
 ):
     
     '''Learn a classification tree based on multiple labeled datasets.
diff --git a/scHPL/train.py b/scHPL/train.py
index a43970a..a8746a1 100644
--- a/scHPL/train.py
+++ b/scHPL/train.py
@@ -19,9 +19,9 @@
 import copy as cp
 
 try:
-    from typing import Literal
+    from typing import Literal, Optional
 except ImportError:
-    from typing_extensions import Literal
+    from typing_extensions import Literal, Optional
 
 
 @ignore_warnings(category=ConvergenceWarning)
@@ -35,7 +35,7 @@ def train_tree(data,
                n_neighbors: int = 50,
                dynamic_neighbors: bool = True,
                distkNN: int = 99,
-               gpu=None):
+               gpu: Optional[int] = None):
     '''Train a hierarchical classifier. 
     
         Parameters

From e75ff9f7cb5d084543d2bd061d817e6cd047cb18 Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Sun, 14 Apr 2024 14:21:18 +0200
Subject: [PATCH 04/12] Loosen pandas dependency

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 40895a3..8f0328b 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@
         "numpy>=1.19.2",
         "scipy>=1.5.2",
         "scikit-learn>=0.23.2",
-        "pandas>=1.1.2,<2.0.0",
+        "pandas>=1.1.2,<=2.2.2",
         "newick~=1.0.0",
         "anndata>=0.7.4",
         "matplotlib>=3.3.1",

From 64a2f6a508382b865d1d0e8da92a5f758d331eed Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Tue, 16 Apr 2024 08:22:22 +0200
Subject: [PATCH 05/12] Add option for compressed FAISS index

---
 scHPL/faissKNeighbors.py |  8 +++++++-
 scHPL/learn.py           |  9 ++++++---
 scHPL/train.py           | 19 ++++++++++++-------
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/scHPL/faissKNeighbors.py b/scHPL/faissKNeighbors.py
index a73c3a5..dd6a9d6 100644
--- a/scHPL/faissKNeighbors.py
+++ b/scHPL/faissKNeighbors.py
@@ -9,17 +9,23 @@
 import numpy as np
 
 class FaissKNeighbors:
-    def __init__(self, k=50, gpu=None):
+    def __init__(self, k=50, gpu=None, compress=False):
         self.index = None
         self.y = None
         self.k = k
         self.gpu = gpu
+        self.compress = compress
 
     def fit(self, X, y):
         self.index = faiss.IndexFlatL2(X.shape[1])
+
+        if self.compress:
+            self.index = faiss.IndexIVFPQ(self.index, X.shape[1], 100, 16, 8)
+
         if self.gpu is not None:
             res = faiss.StandardGpuResources()
             self.index = faiss.index_cpu_to_gpu(res, self.gpu, self.index)
+
         self.index.add(X.astype(np.float32))
         self.y = y
 
diff --git a/scHPL/learn.py b/scHPL/learn.py
index cb9042b..967f2d2 100644
--- a/scHPL/learn.py
+++ b/scHPL/learn.py
@@ -42,7 +42,8 @@ def learn_tree(data: AnnData,
                attach_missing: bool = False,
                print_conf: bool = False,
                print_tree: bool = True,
-               gpu: Optional[int] = None
+               gpu: Optional[int] = None,
+               compress: bool = False
 ):
     
     '''Learn a classification tree based on multiple labeled datasets.
@@ -99,6 +100,8 @@ def learn_tree(data: AnnData,
             Whether to print the tree during the training.
         gpu: int = None
             GPU index to use for the Faiss library (only used when classifier='knn')
+        compress: Boolean = False
+            If 'True', the Faiss index is compressed (only used when classifier='knn')
             
         Returns
         -------
@@ -144,13 +147,13 @@ def learn_tree(data: AnnData,
         if retrain:
             tree = train_tree(data_1, labels_1, tree, classifier, 
                               dimred, useRE, FN, n_neighbors, dynamic_neighbors,
-                              distkNN, gpu=gpu)
+                              distkNN, gpu=gpu, compress=compress)
         else:
             retrain = True 
         
         tree_2 = train_tree(data_2, labels_2, tree_2, classifier, 
                             dimred, useRE, FN, n_neighbors, dynamic_neighbors,
-                            distkNN, gpu=gpu)
+                            distkNN, gpu=gpu, compress=compress)
         
         # Predict labels other dataset
         labels_2_pred,_ = predict_labels(data_2, tree, threshold=rej_threshold)
diff --git a/scHPL/train.py b/scHPL/train.py
index a8746a1..594c9f0 100644
--- a/scHPL/train.py
+++ b/scHPL/train.py
@@ -35,7 +35,8 @@ def train_tree(data,
                n_neighbors: int = 50,
                dynamic_neighbors: bool = True,
                distkNN: int = 99,
-               gpu: Optional[int] = None):
+               gpu: Optional[int] = None,
+               compress: bool = False):
     '''Train a hierarchical classifier. 
     
         Parameters
@@ -69,6 +70,9 @@ def train_tree(data,
             set
         gpu: int | None = None
             GPU index to use for the Faiss library (only used when classifier='knn')
+        compress: bool = False
+            If 'True', the Faiss library will use a compressed index for the kNN 
+            classifier.
 
         
         Returns
@@ -132,7 +136,7 @@ def train_tree(data,
         except:
             None
         _,_ = _train_parentnode(data, labels_train, tree[0], n_neighbors, 
-                                dynamic_neighbors, distkNN, gpu=gpu)
+                                dynamic_neighbors, distkNN, gpu=gpu, compress=compress)
     else:
         for n in tree[0].descendants:
             _ = _train_node(data, labels, n, classifier, dimred, numgenes)
@@ -178,7 +182,7 @@ def _train_node(data, labels, n, classifier, dimred, numgenes):
         
     return group
 
-def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN, gpu=None):
+def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN, gpu=None, compress=False):
     '''Train a knn classifier. In contrast to the linear svm and oc svm, this 
         is trained for each parent node instead of each child node
         
@@ -191,6 +195,7 @@ def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN,
         dimred: dimensionality reduction
         numgenes: number of genes in the training data
         gpu: GPU index to use for the Faiss library (only used when classifier='knn')
+        compress: If 'True', the Faiss library will use a compressed index for the kNN classifier.
         
         Return
         ------
@@ -207,7 +212,7 @@ def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN,
         for j in n.descendants:
             group_new, labels_new = _train_parentnode(data, labels, j, 
                                                       n_neighbors, dynamic_neighbors,
-                                                      distkNN, gpu=gpu)
+                                                      distkNN, gpu=gpu, compress=compress)
             group[np.where(group_new == 1)[0]] = 1
             labels[np.where(group_new == 1)[0]] = labels_new[np.where(group_new == 1)[0]]
         if n.name != None:
@@ -215,7 +220,7 @@ def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN,
             if len(n.descendants) == 1:
                 group[np.squeeze(np.isin(labels, n.name))] = 1
             # train_knn 
-            _train_knn(data,labels,group,n,n_neighbors,dynamic_neighbors,distkNN,gpu=gpu)
+            _train_knn(data,labels,group,n,n_neighbors,dynamic_neighbors,distkNN,gpu=gpu, compress=compress)
             # rename all group == 1 to node.name
             group[np.squeeze(np.isin(labels, n.name))] = 1
             labels[group==1] = n.name[0]
@@ -275,7 +280,7 @@ def _train_svm(data, labels, group, n):
     n.set_classifier(clf) #save classifier to the node
     
 
-def _train_knn(data, labels, group, n, n_neighbors, dynamic_neighbors, distkNN, gpu=None):
+def _train_knn(data, labels, group, n, n_neighbors, dynamic_neighbors, distkNN, gpu=None, compress=False):
     '''Train a linear svm and attach to the node
     
         Parameters:
@@ -304,7 +309,7 @@ def _train_knn(data, labels, group, n, n_neighbors, dynamic_neighbors, distkNN,
     try:
         import faiss
         from .faissKNeighbors import FaissKNeighbors 
-        clf = FaissKNeighbors(k=k, gpu=gpu)
+        clf = FaissKNeighbors(k=k, gpu=gpu, compress=compress)
         clf.fit(data_knn, labels_knn)
         #print('Using FAISS library')
 

From a209a9b8c28c5d4930f396fc1072755b5fea7744 Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Tue, 16 Apr 2024 16:48:06 +0200
Subject: [PATCH 06/12] Add GPU support to prediction

---
 scHPL/faissKNeighbors.py | 7 +++++--
 scHPL/predict.py         | 8 +++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/scHPL/faissKNeighbors.py b/scHPL/faissKNeighbors.py
index dd6a9d6..2e52364 100644
--- a/scHPL/faissKNeighbors.py
+++ b/scHPL/faissKNeighbors.py
@@ -23,12 +23,15 @@ def fit(self, X, y):
             self.index = faiss.IndexIVFPQ(self.index, X.shape[1], 100, 16, 8)
 
         if self.gpu is not None:
-            res = faiss.StandardGpuResources()
-            self.index = faiss.index_cpu_to_gpu(res, self.gpu, self.index)
+            self.to_gpu(self.gpu)
 
         self.index.add(X.astype(np.float32))
         self.y = y
 
+    def to_gpu(self, gpu):  # gpu: device index forwarded to faiss.index_cpu_to_gpu
+        res = faiss.StandardGpuResources()  # NOTE(review): a new resources object is created on every call
+        self.index = faiss.index_cpu_to_gpu(res, gpu, self.index)  # self.index is replaced by the returned index; NOTE(review): case where it is already a GPU index is not handled here - confirm
+
     def predict(self, X):
         distances, indices = self.index.search(X.astype(np.float32), k=self.k)
         votes = self.y[indices]
diff --git a/scHPL/predict.py b/scHPL/predict.py
index 590f7ec..4e45b45 100644
--- a/scHPL/predict.py
+++ b/scHPL/predict.py
@@ -11,7 +11,8 @@
 
 def predict_labels(testdata, 
                    tree: TreeNode, 
-                   threshold: float = 0.5):
+                   threshold: float = 0.5,
+                   gpu=None):
     '''Use the trained tree to predict the labels of a new dataset. 
     
         Parameters
@@ -51,6 +52,11 @@ def predict_labels(testdata,
         pca, pcs = tree[0].get_pca()
         testdata = pca.transform(testdata)
         dimred = True
+
+    if (tree[0].classifier and 
+        tree[0].classifier.__class__ == FaissKNeighbors and 
+        gpu is not None):
+        tree[0].classifier.to_gpu(gpu)
     
     labels_all = []
     prob_all = np.zeros((np.shape(testdata)[0],1))

From 94d3a5f6243d93ca048d46c0bfb2fe85a0025125 Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Tue, 16 Apr 2024 16:50:11 +0200
Subject: [PATCH 07/12] Add tqdm support to prediction

---
 scHPL/predict.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/scHPL/predict.py b/scHPL/predict.py
index 4e45b45..3ba174a 100644
--- a/scHPL/predict.py
+++ b/scHPL/predict.py
@@ -7,6 +7,12 @@
 import numpy as np
 from numpy import linalg as LA
 from .utils import TreeNode
+from .faissKNeighbors import FaissKNeighbors
+try:
+    from tqdm import tqdm
+except ImportError:
+    def tqdm(x):
+        return x
 # from utils import TreeNode
 
 def predict_labels(testdata, 
@@ -60,7 +66,7 @@ def predict_labels(testdata,
     
     labels_all = []
     prob_all = np.zeros((np.shape(testdata)[0],1))
-    for idx, testpoint in enumerate(testdata):
+    for idx, testpoint in enumerate(tqdm(testdata)):
         if useRE:   
             if rej_RE[idx]:
                 labels_all.append('Rejected (RE)')

From 1150261611d9cd5d6c9294613a4a4d317203d873 Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Tue, 16 Apr 2024 17:02:47 +0200
Subject: [PATCH 08/12] Revert "Version bump"

This reverts commit a59e895fee5c80da04a56a0bf176e90d402f6885.
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 8f0328b..dfde56c 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
 
 setuptools.setup(
     name="scHPL", 
-    version="1.0.4",
+    version="1.0.3",
     author="Lieke Michielsen",
     author_email="l.c.m.michielsen@tudelft.nl",
     description="Hierarchical progressive learning pipeline for single-cell RNA-sequencing datasets",

From 2f988d9c762680cb6e39b97b8d6d9a2b9dddacb5 Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Tue, 16 Apr 2024 17:03:08 +0200
Subject: [PATCH 09/12] Revert "Loosen pandas dependency"

This reverts commit e75ff9f7cb5d084543d2bd061d817e6cd047cb18.
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index dfde56c..7205552 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@
         "numpy>=1.19.2",
         "scipy>=1.5.2",
         "scikit-learn>=0.23.2",
-        "pandas>=1.1.2,<=2.2.2",
+        "pandas>=1.1.2,<2.0.0",
         "newick~=1.0.0",
         "anndata>=0.7.4",
         "matplotlib>=3.3.1",

From bbb0f49baf27c76e4c65c1aba6f7d0b98763b721 Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Tue, 16 Apr 2024 17:30:19 +0200
Subject: [PATCH 10/12] Remove option for disabling tree printing

---
 scHPL/learn.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/scHPL/learn.py b/scHPL/learn.py
index 967f2d2..8df860b 100644
--- a/scHPL/learn.py
+++ b/scHPL/learn.py
@@ -9,7 +9,7 @@
 from anndata import AnnData
 
 from .train import train_tree
-from .utils import TreeNode, create_tree, print_tree as print_tree_func
+from .utils import TreeNode, create_tree, print_tree
 from .predict import predict_labels
 from .update import update_tree
 # from train import train_tree
@@ -41,7 +41,6 @@ def learn_tree(data: AnnData,
                match_threshold: float = 0.25,
                attach_missing: bool = False,
                print_conf: bool = False,
-               print_tree: bool = True,
                gpu: Optional[int] = None,
                compress: bool = False
 ):
@@ -96,8 +95,6 @@ def learn_tree(data: AnnData,
             If 'True' missing nodes are attached to the root node.
         print_conf: Boolean = False
             Whether to print the confusion matrices during the matching step.
-        print_tree: Boolean = True
-            Whether to print the tree during the training.
         gpu: int = None
             GPU index to use for the Faiss library (only used when classifier='knn')
         compress: Boolean = False
@@ -129,9 +126,8 @@ def learn_tree(data: AnnData,
     labels_1 = labels[idx_1]
     data_1 = xx[idx_1]
     
-    if print_tree:
-        print('Starting tree:')
-        print_tree_func(tree)
+    print('Starting tree:')
+    print_tree(tree)
     
     for b in batch_order:
         
@@ -170,9 +166,8 @@ def learn_tree(data: AnnData,
         
         missing_pop.extend(mis_pop)
         
-        if print_tree:
-            print('\nUpdated tree:')
-            print_tree_func(tree, np.unique(labels_2))
+        print('\nUpdated tree:')
+        print_tree(tree, np.unique(labels_2))
         
         #concatenate the two datasets
         data_1 = np.concatenate((data_1, data_2), axis = 0)

From 6d931a9cbf8dd8e9d96b89b84b661b61f039c43c Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Tue, 16 Apr 2024 18:42:37 +0200
Subject: [PATCH 11/12] Fix problems with underscores in bold text

---
 scHPL/utils.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/scHPL/utils.py b/scHPL/utils.py
index cf9f3d1..f6d52cb 100644
--- a/scHPL/utils.py
+++ b/scHPL/utils.py
@@ -371,20 +371,15 @@ def _print_node(node, hor, ver_steps, fig, new_nodes):
     x, y = ([np.max([0.05, hor-0.045]), hor], [ver, ver])
     line = mlines.Line2D(x,y, lw=1)
     fig.add_artist(line)
-    
-    # Add textbox
-    if np.isin(node.name[0], new_nodes):
-        txt = r"$\bf{" + node.name[0] + "}$"
-    else:
-        txt = node.name[0]
-    
-    for n in node.name:
-        if(n != node.name[0]):
-            if np.isin(n, new_nodes):
-                txt = txt + ' & ' + r"$\bf{" + n + "}$"
-            else:
-                txt = txt + ' & ' + n
-                
+
+    def format_node(name):  # returns name wrapped in mathtext bold when it is in new_nodes, else unchanged
+        if np.isin(name, new_nodes):
+            return r"$\bf{" + name.replace("_", r"\_") + "}$"  # raw string: a plain "\_" is an invalid escape sequence
+        else:
+            return name
+
+    txt = " & ".join([format_node(n) for n in node.name])
+
     fig.text(hor,ver, txt, size=10,
              ha = 'left', va='center',
              bbox = dict(boxstyle='round', fc='w', ec='k'))

From ac6652ddcde596e2f0da2f22d0eedb53e2d98a3b Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Wed, 17 Apr 2024 11:44:12 +0200
Subject: [PATCH 12/12] Revert "Add option for compressed FAISS index"

This reverts commit 64a2f6a508382b865d1d0e8da92a5f758d331eed.
---
 scHPL/faissKNeighbors.py |  7 +------
 scHPL/learn.py           |  9 +++------
 scHPL/train.py           | 19 +++++++------------
 3 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/scHPL/faissKNeighbors.py b/scHPL/faissKNeighbors.py
index 2e52364..0ac24ae 100644
--- a/scHPL/faissKNeighbors.py
+++ b/scHPL/faissKNeighbors.py
@@ -9,19 +9,14 @@
 import numpy as np
 
 class FaissKNeighbors:
-    def __init__(self, k=50, gpu=None, compress=False):
+    def __init__(self, k=50, gpu=None):
         self.index = None
         self.y = None
         self.k = k
         self.gpu = gpu
-        self.compress = compress
 
     def fit(self, X, y):
         self.index = faiss.IndexFlatL2(X.shape[1])
-
-        if self.compress:
-            self.index = faiss.IndexIVFPQ(self.index, X.shape[1], 100, 16, 8)
-
         if self.gpu is not None:
             self.to_gpu(self.gpu)
 
diff --git a/scHPL/learn.py b/scHPL/learn.py
index 8df860b..b8c7da0 100644
--- a/scHPL/learn.py
+++ b/scHPL/learn.py
@@ -41,8 +41,7 @@ def learn_tree(data: AnnData,
                match_threshold: float = 0.25,
                attach_missing: bool = False,
                print_conf: bool = False,
-               gpu: Optional[int] = None,
-               compress: bool = False
+               gpu: Optional[int] = None
 ):
     
     '''Learn a classification tree based on multiple labeled datasets.
@@ -97,8 +96,6 @@ def learn_tree(data: AnnData,
             Whether to print the confusion matrices during the matching step.
         gpu: int = None
             GPU index to use for the Faiss library (only used when classifier='knn')
-        compress: Boolean = False
-            If 'True', the Faiss index is compressed (only used when classifier='knn')
             
         Returns
         -------
@@ -143,13 +140,13 @@ def learn_tree(data: AnnData,
         if retrain:
             tree = train_tree(data_1, labels_1, tree, classifier, 
                               dimred, useRE, FN, n_neighbors, dynamic_neighbors,
-                              distkNN, gpu=gpu, compress=compress)
+                              distkNN, gpu=gpu)
         else:
             retrain = True 
         
         tree_2 = train_tree(data_2, labels_2, tree_2, classifier, 
                             dimred, useRE, FN, n_neighbors, dynamic_neighbors,
-                            distkNN, gpu=gpu, compress=compress)
+                            distkNN, gpu=gpu)
         
         # Predict labels other dataset
         labels_2_pred,_ = predict_labels(data_2, tree, threshold=rej_threshold)
diff --git a/scHPL/train.py b/scHPL/train.py
index 594c9f0..a8746a1 100644
--- a/scHPL/train.py
+++ b/scHPL/train.py
@@ -35,8 +35,7 @@ def train_tree(data,
                n_neighbors: int = 50,
                dynamic_neighbors: bool = True,
                distkNN: int = 99,
-               gpu: Optional[int] = None,
-               compress: bool = False):
+               gpu: Optional[int] = None):
     '''Train a hierarchical classifier. 
     
         Parameters
@@ -70,9 +69,6 @@ def train_tree(data,
             set
         gpu: int | None = None
             GPU index to use for the Faiss library (only used when classifier='knn')
-        compress: bool = False
-            If 'True', the Faiss library will use a compressed index for the kNN 
-            classifier.
 
         
         Returns
@@ -136,7 +132,7 @@ def train_tree(data,
         except:
             None
         _,_ = _train_parentnode(data, labels_train, tree[0], n_neighbors, 
-                                dynamic_neighbors, distkNN, gpu=gpu, compress=compress)
+                                dynamic_neighbors, distkNN, gpu=gpu)
     else:
         for n in tree[0].descendants:
             _ = _train_node(data, labels, n, classifier, dimred, numgenes)
@@ -182,7 +178,7 @@ def _train_node(data, labels, n, classifier, dimred, numgenes):
         
     return group
 
-def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN, gpu=None, compress=False):
+def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN, gpu=None):
     '''Train a knn classifier. In contrast to the linear svm and oc svm, this 
         is trained for each parent node instead of each child node
         
@@ -195,7 +191,6 @@ def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN,
         dimred: dimensionality reduction
         numgenes: number of genes in the training data
         gpu: GPU index to use for the Faiss library (only used when classifier='knn')
-        compress: If 'True', the Faiss library will use a compressed index for the kNN classifier.
         
         Return
         ------
@@ -212,7 +207,7 @@ def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN,
         for j in n.descendants:
             group_new, labels_new = _train_parentnode(data, labels, j, 
                                                       n_neighbors, dynamic_neighbors,
-                                                      distkNN, gpu=gpu, compress=compress)
+                                                      distkNN, gpu=gpu)
             group[np.where(group_new == 1)[0]] = 1
             labels[np.where(group_new == 1)[0]] = labels_new[np.where(group_new == 1)[0]]
         if n.name != None:
@@ -220,7 +215,7 @@ def _train_parentnode(data, labels, n, n_neighbors, dynamic_neighbors, distkNN,
             if len(n.descendants) == 1:
                 group[np.squeeze(np.isin(labels, n.name))] = 1
             # train_knn 
-            _train_knn(data,labels,group,n,n_neighbors,dynamic_neighbors,distkNN,gpu=gpu, compress=compress)
+            _train_knn(data,labels,group,n,n_neighbors,dynamic_neighbors,distkNN,gpu=gpu)
             # rename all group == 1 to node.name
             group[np.squeeze(np.isin(labels, n.name))] = 1
             labels[group==1] = n.name[0]
@@ -280,7 +275,7 @@ def _train_svm(data, labels, group, n):
     n.set_classifier(clf) #save classifier to the node
     
 
-def _train_knn(data, labels, group, n, n_neighbors, dynamic_neighbors, distkNN, gpu=None, compress=False):
+def _train_knn(data, labels, group, n, n_neighbors, dynamic_neighbors, distkNN, gpu=None):
     '''Train a linear svm and attach to the node
     
         Parameters:
@@ -309,7 +304,7 @@ def _train_knn(data, labels, group, n, n_neighbors, dynamic_neighbors, distkNN,
     try:
         import faiss
         from .faissKNeighbors import FaissKNeighbors 
-        clf = FaissKNeighbors(k=k, gpu=gpu, compress=compress)
+        clf = FaissKNeighbors(k=k, gpu=gpu)
         clf.fit(data_knn, labels_knn)
         #print('Using FAISS library')