Specify the version of python; Enhancements.
Allow creating a series of directories to avoid file-not-found errors.
Use relative imports within the src folder.
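Neither `Tools.initialise_directory` nor the new `Tools.initialise_directories` is shown in this diff (they live in `src/aiders.py`). As a hedged sketch only, a helper that creates a whole directory chain, as the commit message describes, could look like this; the name and signature are assumptions inferred from the call sites in the diff:

```python
import os

def initialise_directories(path: str) -> None:
    """Create `path` together with any missing parent directories.

    Unlike os.mkdir, which fails when the parent does not exist,
    os.makedirs builds the entire chain; exist_ok=True makes repeated
    calls safe instead of raising FileExistsError.
    """
    os.makedirs(path, exist_ok=True)
```

This is why the new call site can drop the `purge=False` argument: creating the chain idempotently is the whole job.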
rtrad89 committed Jul 23, 2021
1 parent ff19232 commit 2ff7ff3
Showing 5 changed files with 12 additions and 12 deletions.
2 changes: 1 addition & 1 deletion README.md

```diff
@@ -2,7 +2,7 @@
 This code repository implements the approach presented in the associated [paper](https://rdcu.be/cjErU) ([preprint here](https://arxiv.org/abs/2011.15038)) to cluster a corpus of documents by authorship and produce the evaluated clustering on several fronts.
 
 # Requirements
-The code is developed mainly with Python 3. You can refer to `requirements.txt` file for necessary Python packages in order to run the code. Please pay special attention to `scikit-learn` version, **which should NOT be newer than 0.22.0** due to compatibility problems with Spherical K-Means implementation (cf. [this relevant issue](https://github.com/jasonlaska/spherecluster/issues/26)). In addition, I have manually patched the code of Spherical KMeans to circumvent a similar compatibility problem, as the release `0.1.7` didn't do so. Future official releases shall fix this but till then, the local version runs smoothly and there is no need to install `spherecluster` library.
+The code is developed mainly with Python 3.8.8. You can refer to `requirements.txt` file for necessary Python packages in order to run the code. Please pay special attention to `scikit-learn` version, **which should NOT be newer than 0.22.0** due to compatibility problems with Spherical K-Means implementation (cf. [this relevant issue](https://github.com/jasonlaska/spherecluster/issues/26)). In addition, I have manually patched the code of Spherical KMeans to circumvent a similar compatibility problem, as the release `0.1.7` didn't do so. Future official releases shall fix this but till then, the local version runs smoothly and there is no need to install `spherecluster` library.
 
 Moreover, the code depends on Hierarchical Dirichlet Process -- HDP as implemented in [blei-lab](https://github.com/blei-lab/hdp). The user should have the HDP code compiled properly in order to use it to produce the latent semantic representation of texts, aka. *LSSR*, for clustering.
```
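The `scikit-learn <= 0.22.0` pin stated in the README could also be enforced with a small runtime guard. The sketch below is illustrative only and not code from this repository (both function names are invented here); it avoids importing `scikit-learn` itself by comparing plain version strings numerically:

```python
def version_tuple(version: str) -> tuple:
    """Convert a dotted version string such as '0.22.0' to (0, 22, 0)."""
    return tuple(int(part) for part in version.split(".")[:3])

def check_sklearn_pin(installed: str, ceiling: str = "0.22.0") -> bool:
    """Return True when the installed version respects the README's pin.

    Tuple comparison is numeric, so '0.9.0' correctly sorts below '0.22.0',
    which a plain string comparison would get wrong.
    """
    return version_tuple(installed) <= version_tuple(ceiling)
```

In practice one might call `check_sklearn_pin(sklearn.__version__)` early and fail fast with a clear message instead of hitting the spherecluster incompatibility later.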
2 changes: 1 addition & 1 deletion cluster_docs.py

```diff
@@ -104,7 +104,7 @@ def save_results(results: List[Dict], k_pred: List[List],
 
     timestamp = pd.to_datetime("now").strftime("%Y%m%d_%H%M%S")
 
-    Tools.initialise_directory(out_dir, purge=False)
+    Tools.initialise_directories(out_dir)
 
     # Construct the results path
     save_path = Tools.get_path(
```
8 changes: 4 additions & 4 deletions src/lss_modeller.py

```diff
@@ -13,7 +13,7 @@
 import time
 import pandas as pd
 from itertools import product
-from aiders import Tools
+from .aiders import Tools
 from typing import Tuple, List
 from collections import defaultdict
 import seaborn as sns
@@ -140,10 +140,10 @@ def _generate_lda_c_corpus(self):
         # Serialise into LDA_C and store on disk
         output_dir = Tools.get_path(
             self.input_docs_path,
-            f"lda_c_format_{self.hdp_eta:0.1f}_{self.hdp_gamma_s:0.1f}",
+            f"lda_c_format_{self.hdp_eta:0.1f}_{self.hdp_gamma_s:0.1f}"
             f"_{self.hdp_alpha_s:0.1f}_common_{self.drop_uncommon}")
 
-        Tools.initialise_directory(output_dir)
+        Tools.initialise_directories(output_dir)
         save_location = Tools.get_path(
             output_dir, f"{self.lda_c_fname}.dat")
 
@@ -158,7 +158,7 @@ def _invoke_gibbs_hdp(self):
 
         param_data = Tools.get_path(
             self.input_docs_path,
-            f"lda_c_format_{self.hdp_eta:0.1f}_{self.hdp_gamma_s:0.1f}",
+            f"lda_c_format_{self.hdp_eta:0.1f}_{self.hdp_gamma_s:0.1f}"
             f"_{self.hdp_alpha_s:0.1f}_common_{self.drop_uncommon}",
             f"{self.lda_c_fname}.dat")
```
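The two f-string edits in this file are subtle: the deleted trailing comma had been passing the directory name to `Tools.get_path` as two separate arguments, while adjacent string literals (f-strings included) on consecutive lines are concatenated by the Python parser into a single argument. A standalone illustration, where the plain variables stand in for the instance attributes used in the diff:

```python
# Sample hyperparameter values; the formatting mirrors the diff's f-strings.
eta, gamma_s = 0.5, 1.0

# Adjacent literals -> one string (what the fixed code now produces).
joined = f"lda_c_format_{eta:0.1f}" f"_{gamma_s:0.1f}"

# Trailing comma -> a tuple of two strings (what the old code passed on).
as_args = (f"lda_c_format_{eta:0.1f}", f"_{gamma_s:0.1f}")

assert joined == "lda_c_format_0.5_1.0"
assert as_args == ("lda_c_format_0.5", "_1.0")
```

With the comma in place, the second fragment became an extra path component, which is presumably what produced the wrong directory names this commit fixes.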
6 changes: 3 additions & 3 deletions src/runall_test_data.py

```diff
@@ -3,9 +3,9 @@
 Run authorial clustering experiments on PAN-17 test dataset.
 """
-from lss_modeller import LssHdpModeller, LssBTModeller
-from aiders import Tools
-from clustering import Clusterer
+from .lss_modeller import LssHdpModeller, LssBTModeller
+from .aiders import Tools
+from .clustering import Clusterer
 from typing import List, Dict
 from time import perf_counter as tpc
 import warnings
```
6 changes: 3 additions & 3 deletions src/runall_train_data.py

```diff
@@ -3,9 +3,9 @@
 Run authorial clustering experiments on PAN-17 train dataset.
 """
-from lss_modeller import LssHdpModeller
-from aiders import Tools
-from clustering import Clusterer
+from .lss_modeller import LssHdpModeller
+from .aiders import Tools
+from .clustering import Clusterer
 import warnings
 
 warnings.filterwarnings(action="ignore")  # Suppress warnings for this code file
```
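The switch from `from aiders import Tools` to `from .aiders import Tools` makes the imports package-relative, so the modules resolve correctly when `src` is imported as a package rather than run with the folder itself on `sys.path`. A self-contained illustration of why that matters (the throwaway file names here are invented for the demo and mirror the repository's `src/` layout):

```python
import importlib
import os
import sys
import tempfile

# Build a throwaway package: src/__init__.py, src/aiders.py, src/runall.py.
root = tempfile.mkdtemp()
pkg = os.path.join(root, "src")
os.makedirs(pkg)

open(os.path.join(pkg, "__init__.py"), "w").close()
with open(os.path.join(pkg, "aiders.py"), "w") as fh:
    fh.write("class Tools:\n    pass\n")
with open(os.path.join(pkg, "runall.py"), "w") as fh:
    # The relative import only resolves when this file is loaded as src.runall.
    fh.write("from .aiders import Tools\n")

sys.path.insert(0, root)
importlib.invalidate_caches()
module = importlib.import_module("src.runall")  # ".aiders" resolves via the package
```

Running such a file directly (e.g. `python src/runall.py`) would instead raise `ImportError: attempted relative import with no known parent package`; scripts with relative imports are meant to be launched as modules, e.g. `python -m src.runall` from the repository root.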
