From 9e88dcabb7390fead997c8998dc7176d459087f7 Mon Sep 17 00:00:00 2001 From: Alexander Berger Date: Tue, 18 Jun 2024 07:44:54 -0400 Subject: [PATCH] Adding sphinx configs and deployment definitions --- .dockerignore | 6 + Dockerfile | 2 + deploy/k8s/base/deployment.yaml | 9 +- pyproject.toml | 2 +- sample-configs/k8s/sphinx/sphinx.conf | 291 ++++++++++++++++++++++ sample-configs/k8s/sphinx/start_sphinx.sh | 17 ++ sample-configs/k8s/sphinx/stopwords.txt | 103 ++++++++ 7 files changed, 428 insertions(+), 2 deletions(-) create mode 100644 .dockerignore create mode 100644 sample-configs/k8s/sphinx/sphinx.conf create mode 100755 sample-configs/k8s/sphinx/start_sphinx.sh create mode 100644 sample-configs/k8s/sphinx/stopwords.txt diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..781e89c7 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,6 @@ +deploy/ +.github/ +docs/ +skaffold.yaml +.gitignore +.git diff --git a/Dockerfile b/Dockerfile index 352895c5..d73e0d0e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,6 +24,8 @@ ENV PATH="${POETRY_HOME}/bin:${PATH}" WORKDIR /app +COPY sample-configs/k8s/sphinx /app/sphinx/ + COPY pyproject.toml poetry.lock README.md /app/ RUN poetry install --sync --no-root diff --git a/deploy/k8s/base/deployment.yaml b/deploy/k8s/base/deployment.yaml index 9503b567..346b84a8 100644 --- a/deploy/k8s/base/deployment.yaml +++ b/deploy/k8s/base/deployment.yaml @@ -33,4 +33,11 @@ spec: - secretRef: name: geneweaver-legacy-secrets ports: - - containerPort: 8000 \ No newline at end of file + - containerPort: 8000 + - name: geneweaver-legacy-search + image: geneweaver-legacy + imagePullPolicy: Always + envFrom: + - secretRef: + name: geneweaver-db + command: ["/bin/bash", "-c", "/app/sphinx/start_sphinx.sh"] diff --git a/pyproject.toml b/pyproject.toml index eaee1323..827afb2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "geneweaver-legacy" -version = "1.1.1" +version = "1.2.0" description = "" authors = ["Alexander Berger "] readme = "README.md" diff --git a/sample-configs/k8s/sphinx/sphinx.conf b/sample-configs/k8s/sphinx/sphinx.conf new file mode 100644 index 00000000..4bc748c6 --- /dev/null +++ b/sample-configs/k8s/sphinx/sphinx.conf @@ -0,0 +1,291 @@ + +source geneset_src : base +{ + # potentially delete old entry, first + sql_query_pre = SET search_path to production,extsrc,odestatic; + sql_query_pre = DELETE FROM sphinxcounters WHERE index_name='geneset_tmp'; + sql_query_pre = INSERT INTO sphinxcounters VALUES ('geneset_tmp', NOW()); + + sql_query_range = \ + SELECT min(gs.gs_id), max(gs.gs_id) \ + FROM geneset gs \ + WHERE gs.gs_status<>'deleted' \ + AND gs.gs_updated < ( \ + SELECT last_update FROM sphinxcounters WHERE index_name='geneset_tmp'); + sql_range_step = 1000 + + sql_query = \ + SELECT gs.gs_id, gs.gs_id, gs.gs_id as gs_id_attr, 'GS'||gs.gs_id AS gsid_prefixed, \ + gs.gs_name as name, gs.gs_description as description, \ + gs.gs_abbreviation as label, gs.usr_id, gs.gs_count, \ + p.pub_pubmed AS pubmed_id, p.pub_authors, p.pub_title, p.pub_abstract, p.pub_journal, \ + sp.sp_name as species, sp.sp_taxid as taxid, \ + COALESCE(gs.cur_id, 0) AS cur_id, COALESCE(gs.sp_id, 0) AS sp_id, \ + COALESCE(gs.gs_attribution, 0) as attribution, \ + CASE \ + WHEN gs.gs_status='provisional' THEN 1 \ + WHEN gs.gs_status LIKE 'deprecated%' THEN 2 \ + ELSE 0 \ + END AS gs_status, \ + CASE \ + WHEN gs.sp_id=1 THEN 'mouse' \ + WHEN gs.sp_id=2 THEN 'human' \ + WHEN gs.sp_id=3 THEN 'rat' \ + WHEN gs.sp_id=4 THEN 'zebrafish' \ + WHEN gs.sp_id=5 THEN 'fly' \ + WHEN gs.sp_id=6 THEN 'monkey' \ + WHEN gs.sp_id=8 THEN 'c. elegans' \ + WHEN gs.sp_id=9 THEN 'yeast' \ + END AS common_name \ + FROM geneset gs \ + LEFT OUTER JOIN publication p USING(pub_id) \ + LEFT OUTER JOIN species sp USING(sp_id) \ + WHERE gs.gs_status<>'deleted' \ + AND gs.gs_id>=$start AND gs.gs_id<=$end \ + AND gs.gs_updated < ( \ + SELECT last_update FROM sphinxcounters WHERE index_name='geneset_tmp'); + + # gene ref ids, gene names, and type + # ode_ref_id||' '||COALESCE(gi_name, '')||' '||COALESCE(gi_type, '') \ + + # just gene ref ids + sql_joined_field = genes from query; \ + SELECT gs.gs_id, ode_ref_id \ + FROM geneset gs \ + JOIN geneset_value USING(gs_id) \ + JOIN gene USING(ode_gene_id) \ + JOIN gene_info USING(ode_gene_id) \ + WHERE gs.gs_status<>'deleted' AND gsv_in_threshold AND ode_pref \ + AND gs.gs_updated < ( \ + SELECT last_update FROM sphinxcounters WHERE index_name='geneset_tmp') \ + ORDER BY gs.gs_id ASC; + + # add in ontology term associations + sql_joined_field = ontologies from query; \ + SELECT gso.gs_id, o.ont_ref_id||' '||o.ont_name||' '||o.ont_description \ + FROM ontology o, geneset_ontology gso, geneset gs \ + WHERE gso.gso_ref_type!='Blacklist' \ + AND gso.ont_id=o.ont_id \ + AND gso.gs_id=gs.gs_id AND gs.gs_status<>'deleted' \ + AND gs.gs_updated < ( \ + SELECT last_update FROM sphinxcounters WHERE index_name='geneset_tmp') \ + ORDER BY gs_id ASC; + + # both indexed and attributed + sql_field_string = common_name + sql_field_string = species + sql_field_string = taxid + sql_field_string = pubmed_id + + sql_attr_uint = sp_id + sql_attr_uint = usr_id + sql_attr_uint = cur_id + sql_attr_uint = attribution + sql_attr_uint = gs_status + sql_attr_uint = gs_count + sql_attr_uint = gs_id_attr + + sql_attr_multi = uint grp_id from query; \ + SELECT gs_id, regexp_split_to_table(gs_groups, ',')::integer AS grp_id \ + FROM production.geneset \ + WHERE gs_status<>'deleted' AND (gs_groups <> '') IS NOT FALSE AND gs_updated < ( \ + SELECT last_update FROM production.sphinxcounters WHERE index_name='geneset_tmp') \ + ORDER BY gs_id ASC; + + sql_query_post_index = SET search_path to production,extsrc,odestatic; + sql_query_post_index = DELETE FROM sphinxcounters WHERE index_name='geneset'; + sql_query_post_index = UPDATE sphinxcounters \ + SET index_name='geneset' WHERE index_name='geneset_tmp'; + sql_query_post_index = DELETE FROM genesetklist WHERE ts < ( \ + SELECT last_update FROM sphinxcounters WHERE index_name='geneset'); +} + +source geneset_delta_src : base +{ + sql_query_pre = SET search_path to production,extsrc,odestatic; + + sql_query = \ + SELECT gs.gs_id, gs.gs_id, gs.gs_id as gs_id_attr, 'GS'||gs.gs_id AS gsid_prefixed, \ + gs.gs_name as name, gs.gs_description as description, \ + gs.gs_abbreviation as label, gs.usr_id, gs.gs_count, \ + p.pub_pubmed AS pubmed_id, p.pub_authors, p.pub_title, p.pub_abstract, p.pub_journal, \ + sp.sp_name as species, sp.sp_taxid as taxid, \ + COALESCE(gs.cur_id, 0) AS cur_id, COALESCE(gs.sp_id, 0) AS sp_id, \ + COALESCE(gs.gs_attribution, 0) as attribution, \ + CASE \ + WHEN gs.gs_status='provisional' THEN 1 \ + WHEN gs.gs_status LIKE 'deprecated%' THEN 2 \ + ELSE 0 \ + END AS gs_status, \ + CASE \ + WHEN gs.sp_id=1 THEN 'mouse' \ + WHEN gs.sp_id=2 THEN 'human' \ + WHEN gs.sp_id=3 THEN 'rat' \ + WHEN gs.sp_id=4 THEN 'zebrafish' \ + WHEN gs.sp_id=5 THEN 'fly' \ + WHEN gs.sp_id=6 THEN 'monkey' \ + WHEN gs.sp_id=8 THEN 'c. elegans' \ + WHEN gs.sp_id=9 THEN 'yeast' \ + END AS common_name \ + FROM geneset gs \ + LEFT OUTER JOIN publication p USING(pub_id) \ + LEFT OUTER JOIN species sp USING(sp_id) \ + WHERE gs.gs_status<>'deleted' \ + AND gs.gs_updated >= ( \ + SELECT last_update FROM sphinxcounters WHERE index_name='geneset'); + + # gene ref ids, gene names, and type + # ode_ref_id||' '||COALESCE(gi_name, '')||' '||COALESCE(gi_type, '') \ + + # just gene ref ids + sql_joined_field = genes from query; \ + SELECT gs.gs_id, ode_ref_id \ + FROM geneset gs \ + JOIN geneset_value USING(gs_id) \ + JOIN gene USING(ode_gene_id) \ + JOIN gene_info USING(ode_gene_id) \ + WHERE gs.gs_status<>'deleted' AND gsv_in_threshold AND ode_pref \ + AND gs.gs_updated >= ( \ + SELECT last_update FROM sphinxcounters WHERE index_name='geneset') \ + ORDER BY gs.gs_id ASC; + + # add in ontology term associations + sql_joined_field = ontologies from query; \ + SELECT gso.gs_id, o.ont_ref_id||' '||o.ont_name||' '||o.ont_description \ + FROM ontology o, geneset_ontology gso, geneset gs \ + WHERE gso.gso_ref_type!='Blacklist' \ + AND gso.ont_id=o.ont_id \ + AND gso.gs_id=gs.gs_id AND gs.gs_status<>'deleted' \ + AND gs.gs_updated >= ( \ + SELECT last_update FROM sphinxcounters WHERE index_name='geneset') \ + ORDER BY gs_id ASC; + + # both indexed and attributed + sql_field_string = common_name + sql_field_string = species + sql_field_string = taxid + sql_field_string = pubmed_id + + sql_attr_uint = sp_id + sql_attr_uint = usr_id + sql_attr_uint = cur_id + sql_attr_uint = attribution + sql_attr_uint = gs_status + sql_attr_uint = gs_count + sql_attr_uint = gs_id_attr + + sql_attr_multi = uint grp_id from query; \ + SELECT gs_id, regexp_split_to_table(gs_groups, ',')::integer AS grp_id \ + FROM production.geneset \ + WHERE gs_status<>'deleted' AND gs_updated >= ( \ + SELECT last_update FROM production.sphinxcounters WHERE index_name='geneset') \ + ORDER BY gs_id ASC; + + sql_query_killlist = \ + SELECT gs_id FROM geneset WHERE gs_updated >= ( \ + SELECT last_update FROM sphinxcounters WHERE index_name='geneset') \ + UNION SELECT gs_id from genesetklist; +} + +############################################################################# +## index definitions +############################################################################# + +index geneset +{ + source = geneset_src + path = /app/sphinx/geneset_idx + + morphology = stem_en + + # wordforms file, in "mapfrom > mapto" plain text format + # optional, default is empty + # + # wordforms = ../var/data/wordforms.txt + + # minimum indexed word length; default is 1 (index everything) + min_word_len = 1 + + html_strip = 1 + index_exact_words = 1 + + stopwords = /app/sphinx/stopwords.txt + + min_prefix_len = 1 + enable_star = 1 # allow for partial results +} + +index geneset_delta +{ + source = geneset_delta_src + path = /app/sphinx/geneset_delta_idx + + morphology = stem_en + + # wordforms file, in "mapfrom > mapto" plain text format + # optional, default is empty + # + # wordforms = ../var/data/wordforms.txt + + # minimum indexed word length; default is 1 (index everything) + min_word_len = 1 + + html_strip = 1 + index_exact_words = 1 + + stopwords = /app/sphinx/stopwords.txt + + min_prefix_len = 1 + enable_star = 1 # allow for partial results +} + +############################################################################# +## indexer settings +############################################################################# + +indexer +{ + mem_limit = 2048M +} + +############################################################################# +## searchd settings +############################################################################# + +searchd +{ + # hostname, port, or hostname:port, or /unix/socket/path to listen on + # multi-value, multiple listen points are allowed + # optional, default is 0.0.0.0:9312 (listen on all interfaces, port 9312) + # + # listen = 127.0.0.1 + # listen = 192.168.0.1:9312 + # listen = 9312 + # listen = /var/run/searchd.sock + + + # log file, searchd run info is logged here + # optional, default is 'searchd.log' + log = /app/sphinx/sphinx-searchd.log + + # query log file, all search queries are logged here + # optional, default is empty (do not log queries) + query_log = /app/sphinx/sphinx-query.log + + # maximum amount of children to fork (concurrent searches to run) + # optional, default is 0 (unlimited) + max_children = 30 + + # PID file, searchd process ID file name + # mandatory + pid_file = /app/sphinx/sphinx-searchd.pid + + # max amount of matches the daemon ever keeps in RAM, per-index + # WARNING, THERE'S ALSO PER-QUERY LIMIT, SEE SetLimits() API CALL + # default is 1000 (just like Google) + max_matches = 1000 + max_packet_size = 64M + + # avoid deprecation warning + #compat_sphinxql_magics = 0 +} diff --git a/sample-configs/k8s/sphinx/start_sphinx.sh b/sample-configs/k8s/sphinx/start_sphinx.sh new file mode 100755 index 00000000..254c13ee --- /dev/null +++ b/sample-configs/k8s/sphinx/start_sphinx.sh @@ -0,0 +1,17 @@ +{ + echo "source base" + echo "{" + echo " type = pgsql" + echo " sql_host = $DB_HOST" + echo " sql_user = $DB_USERNAME" + echo " sql_pass = $DB_PASSWORD" + echo " sql_db = $DB_NAME" + echo "}" + cat /app/sphinx/sphinx.conf +} > /app/sphinx/sphinx.conf.new + +mv /app/sphinx/sphinx.conf.new /app/sphinx/sphinx.conf + +indexer --all --config /app/sphinx/sphinx.conf + +searchd --nodetach --config /app/sphinx/sphinx.conf diff --git a/sample-configs/k8s/sphinx/stopwords.txt b/sample-configs/k8s/sphinx/stopwords.txt new file mode 100644 index 00000000..6fe45df3 --- /dev/null +++ b/sample-configs/k8s/sphinx/stopwords.txt @@ -0,0 +1,103 @@ +i +the +to +and +a +of +http +com +it +you +that +in +s +my +is +was +for +www +t +he +on +me +with +but +so +have +this +be +at +his +we +as +all +m +like +what +img +nbsp +are +out +up +she +do +her +they +or +if +by +had +one +your +about +can +from +there +get +when +him +no +now +would +then +don +will +been +some +an +who +how +going +them +got +well +am +were +because +ve +want +much +see +ll +d +has +over +re +into +only +which +other +too +here +could +even +than +off +did +their +amp +also +should +these +where +within