-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
125 lines (110 loc) · 4.18 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import argparse
import itertools
import json
import os
import pandas as pd
from src import utils
from src.models.feature_extraction import TfidfVectorizerModel
from src.models.experiments import NMFExperiment
from src.preprocessing import PreprocessingPipeline
def main():
parser = argparse.ArgumentParser(
description="Run the job posting analysis pipeline."
)
parser.add_argument(
"--company_name",
type=str,
required=True,
help="Name of the specific company to analyze.",
)
parser.add_argument(
"--kaggle_postings_file",
type=str,
required=True,
help="Name or path to the Kaggle postings CSV file.",
)
parser.add_argument(
"--companyA_file",
type=str,
required=True,
help="Name or path to the companyA job postings Excel file.",
)
args = parser.parse_args()
if os.path.exists("data/out/sample_data.csv"):
sample = pd.read_csv("data/out/sample_data.csv")
else:
KAGGLE_FILE = args.kaggle_postings_file
COMPANYA_FILE = args.companyA_file
IMPORTANT_COLUMNS = ["job_id", "title", "description", "company_name", "skills"]
COMPANY_NAME = args.company_name
# Load the data
relevant_roles = utils.read_text_list("data/preprocessing/relevant_tech.txt")
kaggle_postings = utils.load_data(KAGGLE_FILE)
companyA_data = utils.load_data(COMPANYA_FILE)
kaggle_postings["skills"] = None
companyA_data["company_name"] = COMPANY_NAME
companyA_data.rename(
columns={"Job Posting ID": "job_id", "Skills": "skills"}, inplace=True
)
print("Initializing the preprocessing pipeline...\n")
# Initialize the preprocessing pipeline
pipeline = PreprocessingPipeline(
[kaggle_postings, companyA_data], IMPORTANT_COLUMNS
)
print("Preprocessing the data...\n")
# Preprocess the data
data = pipeline.preprocess(
filter_keywords=True,
keyword_filepath="data/preprocessing/irrelevant_jobs.txt",
column_name="title",
get_skills=True,
skills_abr_filename="job_skills.csv",
skills_map_filename="skills.csv",
)
pipeline.update_sample(COMPANY_NAME, relevant_roles)
sample = pipeline.get_sample_data()
sample.to_csv("data/out/sample_data.csv", index=False)
print("Sample size:", sample.shape[0], sample.shape[1])
print("Columns:", sample.columns)
print("Relevant jobs:", sample[sample["is_irrelevant"] == 0].shape[0])
print("Irrelevant jobs:", sample[sample["is_irrelevant"] == 1].shape[0])
print("\nFinished running the pipeline.\n")
print("Start running the experiments...\n")
# Run the experiments
# Define parameter grid
param_grid = {
"n_components": list(range(5, 31)),
"init": ["nndsvd"],
"solver": ["mu"],
"beta_loss": ["kullback-leibler"],
"alpha_W": [0.00005],
"alpha_H": [0.00005],
"l1_ratio": [0.5],
"random_state": [42],
"max_iter": [1000],
"num_top_words": [20],
}
results = {}
output_dir = "data/out"
param_combinations = list(itertools.product(*param_grid.values()))
print("Total combinations:", len(param_combinations))
param_names = list(param_grid.keys())
stop_words = list(utils.read_text_list("data/preprocessing/stop_words.txt"))
for param_combination in param_combinations:
# print number of components
print(f"Running experiment with {param_combination[0]} components...")
params = dict(zip(param_names, param_combination))
params["vectorizer"] = TfidfVectorizerModel(
stop_words=stop_words,
max_df=0.95,
min_df=2,
)
experiment = NMFExperiment(params, output_dir)
coherence_score = experiment.run(sample["description"])
results[str(params)] = coherence_score
# Save results to a JSON file
results_filepath = os.path.join(output_dir, "nmf_grid_search_results.json")
with open(results_filepath, "w") as f:
json.dump(results, f, indent=4)
if __name__ == "__main__":
main()