# configuration.yaml
---
#### Upstream analysis ####

# Working directory for the mutational analysis
WORK_GENE_DIR: 'scripts_gene_analysis/'

# Dataset for the analysis of somatic mutations.
# NOTE(review): DATASET is defined a second time in the "Downstream analysis"
# section of this file. Duplicate keys are not valid YAML, and loaders that
# tolerate them (e.g. PyYAML) keep only the last value, so this entry is
# overridden. It is kept here as a comment; give it a distinct key name if
# the upstream scripts need this dataset.
# DATASET: 'datasets/full_columns_data.tsv'

# Directory for Leukemia gene lists
GENE_FILES_FOLDER: 'scripts_gene_analysis/gene_folder/'

# Output directory for Leukemia gene lists
HALLMARK_RESULTS: 'scripts_gene_analysis/hallmark/'

# Directory for the over-representation analysis results
ENRICH_FOLDER: 'scripts_gene_analysis/hallmark/ora_results/'

# Directory for the over-representation analysis results restricted to the
# 15 most significant pathways
ENR_15_FOLDER: 'scripts_gene_analysis/hallmark/ern_res_p_values_15/'

# Directory for the plots
ENR_PLOTS: 'scripts_gene_analysis/hallmark/enr_plots/'

# File used to check that results exist in ENRICH_FOLDER.
# NOTE(review): the key is named AML_*, but the file name refers to
# "Acute Lymphoblastic Leukemia" - confirm which one is intended.
AML_ENRICH: 'scripts_gene_analysis/hallmark/ora_results/Acute Lymphoblastic Leukemia_enrichment_results.csv'

# File used to check that results exist in ENR_15_FOLDER
AML_ENRICH_15: 'scripts_gene_analysis/hallmark/ern_res_p_values_15/LAML_gene_list_top_enriched_pathways.tsv'

# File used to check that results exist in ENR_PLOTS
AML_PLOT: 'scripts_gene_analysis/hallmark/enr_plots/Acute Myeloid Leukemia_plot.png'
#### Downstream analysis ####

# Random seed, fixed for reproducibility
SEED: 42

# Data paths
WORK_DIR: 'Somatic-Mutations/'
DATASET: 'datasets/full_data.tsv'
OUTPUT_DIR: 'result_files/'

# Correlation matrix (cramers_v): image and text results
CORR_IMAGE: 'result_files/categorical_correlation.png'
CORR_RESULTS: 'result_files/correlation_results.txt'

# Feature selection plots
ANOVA: 'result_files/ANOVA_F_value_feature_importance.png'
MIT: 'result_files/mutual_info_class.png'

# Optimal number of trees
N_OPTIMAL_TREES: 200

# Lazy predict results
LZP_RESULTS: 'result_files/lzp_results.tsv'

# Random Forest result files
CROSS_VAL: 'result_files/cv_score.txt'
ACCURACY: 'result_files/accuracy.txt'
RF_BEST_PARAMS: 'result_files/rf_best_params.txt'
REPORT_RF: 'result_files/class_report.txt'
CONFUSION_MTX: 'result_files/confusion_mtx.txt'

# Best parameters
BEST_PARAMS: 'result_files/best_params.txt'

# Number of epochs
EPOCHS: 20

# Multilayer Perceptron validation results
MLP_RESULTS: 'result_files/mlp_folder/validation_results.txt'
# MLP hyperparameter optimization grid.
# NOTE(review): PARAM_GRID is defined again later in this file. Duplicate keys
# are not valid YAML, and loaders that tolerate them (e.g. PyYAML) keep only
# the last value, so this search grid is overridden. It is kept here as a
# commented-out alternative: to run the full search, uncomment this mapping
# and comment out the later PARAM_GRID entry.
# PARAM_GRID:
#   dropout_rate: [0.1, 0.2, 0.3, 0.4, 0.5]
#   learning_rate: [0.001, 0.002, 0.003, 0.004, 0.005]
#   batch_size: [35, 70, 100, 200, 500]
#   neurons_1st_layer: [16, 32, 64, 128, 256]
#   neurons_2nd_layer: [32, 64, 128, 256, 512]
# Random Forest hyperparameter optimization grid: each key maps to the list
# of candidate values for that hyperparameter.
RF_PARAMS:
  bootstrap: [true, false]
  max_features: ['log2', 'sqrt']
  n_estimators: [50, 100, 200, 500, 1000, 1500, 2000]
# Final Parameters: MLP hyperparameter grid with a single value per entry.
PARAM_GRID:
  dropout_rate: [0.1]
  learning_rate: [0.002]
  batch_size: [35]
  neurons_1st_layer: [16]
  neurons_2nd_layer: [32]