-
Notifications
You must be signed in to change notification settings - Fork 73
/
geocode.rs
2161 lines (1946 loc) · 94.5 KB
/
geocode.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
static USAGE: &str = r#"
Geocodes a location in CSV data against an updatable local copy of the Geonames cities index.
When you run the command for the first time, it will download a prebuilt Geonames cities
index from the qsv GitHub repo and use it going forward. You can operate on the local
index using the index-* subcommands.
By default, the prebuilt index uses the Geonames Gazetteer cities15000.zip file using
English names. It contains cities with populations > 15,000 (about ~26k cities).
See https://download.geonames.org/export/dump/ for more information.
It has seven major subcommands:
* suggest - given a partial City name, return the closest City's location metadata
per the local Geonames cities index (Jaro-Winkler distance)
* suggestnow - same as suggest, but using a partial City name from the command line,
instead of CSV data.
* reverse - given a WGS-84 location coordinate, return the closest City's location
metadata per the local Geonames cities index.
(Euclidean distance - shortest distance "as the crow flies")
* reversenow - same as reverse, but using a coordinate from the command line,
instead of CSV data.
* countryinfo - returns the country information for the ISO-3166 2-letter country code
(e.g. US, CA, MX, etc.)
* countryinfonow - same as countryinfo, but using a country code from the command line,
instead of CSV data.
* index-* - operations to update the local Geonames cities index.
(index-check, index-update, index-load & index-reset)
SUGGEST
Suggest a Geonames city based on a partial city name. It returns the closest Geonames
city record based on the Jaro-Winkler distance between the partial city name and the
Geonames city name.
The geocoded information is formatted based on --formatstr, returning it in
'%location' format (i.e. "(lat, long)") if not specified.
Use the --new-column option if you want to keep the location column:
Examples:
Geocode file.csv city column and set the geocoded value to a new column named lat_long.
$ qsv geocode suggest city --new-column lat_long file.csv
Limit suggestions to the US, Canada and Mexico.
$ qsv geocode suggest city --country us,ca,mx file.csv
Limit suggestions to New York State and California, with matches in New York state
having higher priority as it's listed first.
$ qsv geocode suggest city --country us --admin1 "New York,US.CA" file.csv
If we use admin1 codes, we can omit --country as it will be inferred from the admin1 code prefix.
$ qsv geocode suggest city --admin1 "US.NY,US.CA" file.csv
Geocode file.csv city column with --formatstr=%state and set the
geocoded value to a new column named state.
$ qsv geocode suggest city --formatstr %state --new-column state file.csv
Use dynamic formatting to create a custom format.
$ qsv geocode suggest city -f "{name}, {admin1}, {country} in {timezone}" file.csv
# using French place names. You'll need to rebuild the index with the --languages option first
$ qsv geocode suggest city -f "{name}, {admin1}, {country} in {timezone}" -l fr file.csv
SUGGESTNOW
Accepts the same options as suggest, but does not require an input file.
Its default format is more verbose - "{name}, {admin1} {country}: {latitude}, {longitude}"
$ qsv geocode suggestnow "New York"
$ qsv geocode suggestnow --country US -f %cityrecord "Paris"
$ qsv geocode suggestnow --admin1 "US:OH" "Athens"
REVERSE
Reverse geocode a WGS 84 coordinate to the nearest City. It returns the closest Geonames
city record based on the Euclidean distance between the coordinate and the nearest city.
It accepts "lat, long" or "(lat, long)" format.
The geocoded information is formatted based on --formatstr, returning it in
'%city-admin1' format if not specified.
Examples:
Reverse geocode file.csv LatLong column. Set the geocoded value to a new column named City.
$ qsv geocode reverse LatLong -c City file.csv
Reverse geocode file.csv LatLong column and set the geocoded value to a new column
named CityState, output to a file named file_with_citystate.csv.
$ qsv geocode reverse LatLong -c CityState file.csv -o file_with_citystate.csv
The same as above, but get the timezone instead of the city and state.
$ qsv geocode reverse LatLong -f %timezone -c tz file.csv -o file_with_tz.csv
REVERSENOW
Accepts the same options as reverse, but does not require an input file.
$ qsv geocode reversenow "40.71427, -74.00597"
$ qsv geocode reversenow --country US -f %cityrecord "40.71427, -74.00597"
$ qsv geocode reversenow --admin1 "US:OH" "(39.32924, -82.10126)"
COUNTRYINFO
Returns the country information for the specified ISO-3166 2-letter country code.
$ qsv geocode countryinfo country_col data.csv
$ qsv geocode countryinfo --formatstr "%json" country_col data.csv
$ qsv geocode countryinfo -f "%continent" country_col data.csv
$ qsv geocode countryinfo -f "{country_name} ({fips}) in {continent}" country_col data.csv
COUNTRYINFONOW
Accepts the same options as countryinfo, but does not require an input file.
$ qsv geocode countryinfonow US
$ qsv geocode countryinfonow --formatstr "%pretty-json" US
$ qsv geocode countryinfonow -f "%continent" US
$ qsv geocode countryinfonow -f "{country_name} ({fips}) in {continent}" US
INDEX-<operation>
Manage the local Geonames cities index used by the geocode command.
It has four operations:
* check - checks if the local Geonames index is up-to-date compared to the Geonames website.
returns the index file's metadata JSON to stdout.
* update - updates the local Geonames index with the latest changes from the Geonames website.
use this command judiciously as it downloads about ~200mb of data from Geonames
and rebuilds the index from scratch using the --languages option.
If you don't need a language other than English, use the index-load subcommand instead
as it's faster and will not download any data from Geonames.
* reset - resets the local Geonames index to the default prebuilt, English-only Geonames cities
index (cities15000) - downloading it from the qsv GitHub repo for the current qsv version.
* load - load a Geonames cities index from a file, making it the default index going forward.
If set to 500, 1000, 5000 or 15000, it will download the corresponding English-only
Geonames index bincode file from the qsv GitHub repo for the current qsv version.
Examples:
Update the Geonames cities index with the latest changes.
$ qsv geocode index-update
# or rebuild the index using the latest Geonames data
# with English, French, German & Spanish place names
$ qsv geocode index-update --languages en,fr,de,es
Load an alternative Geonames cities index from a file, making it the default index going forward.
$ qsv geocode index-load my_geonames_index.bincode
For more extensive examples, see https://github.com/dathere/qsv/blob/master/tests/test_geocode.rs.
Usage:
qsv geocode suggest [--formatstr=<string>] [options] <column> [<input>]
qsv geocode suggestnow [options] <location>
qsv geocode reverse [--formatstr=<string>] [options] <column> [<input>]
qsv geocode reversenow [options] <location>
qsv geocode countryinfo [options] <column> [<input>]
qsv geocode countryinfonow [options] <location>
qsv geocode index-load <index-file>
qsv geocode index-check
qsv geocode index-update [--languages=<lang>] [--cities-url=<url>] [--force] [--timeout=<seconds>]
qsv geocode index-reset
qsv geocode --help
geocode arguments:
<input> The input file to read from. If not specified, reads from stdin.
<column> The column to geocode. Used by suggest, reverse & countryinfo subcommands.
For suggest, it must be a column with a City string pattern.
For reverse, it must be a column using WGS 84 coordinates in
"lat, long" or "(lat, long)" format.
For countryinfo, it must be a column with an ISO 3166-1 alpha-2 country code.
Note that you can use column selector syntax to select the column, but only
the first column will be used. See `select --help` for more information.
<location> The location to geocode for suggestnow, reversenow & countryinfonow subcommands.
For suggestnow, it's a City string pattern.
For reversenow, it must be a WGS 84 coordinate.
For countryinfonow, it must be an ISO 3166-1 alpha-2 code.
<index-file> The alternate geonames index file to use. It must be a .bincode file.
For convenience, if this is set to 500, 1000, 5000 or 15000, it will download
the corresponding English-only Geonames index bincode file from the qsv GitHub repo
for the current qsv version and use it. Only used by the index-load subcommand.
geocode options:
-c, --new-column <name> Put the transformed values in a new column instead. Not valid when
using the '%dyncols:' --formatstr option.
-r, --rename <name> New name for the transformed column.
--country <country_list> The comma-delimited, case-insensitive list of countries to filter for.
Country is specified as an ISO 3166-1 alpha-2 (two-letter) country code.
https://en.wikipedia.org/wiki/ISO_3166-2
It is the topmost priority filter, and will be applied first. If multiple
countries are specified, they are matched in priority order.
For suggest, this will limit the search to the specified countries.
For reverse, this ensures that the returned city is in the specified
countries (especially when geocoding coordinates near country borders).
If the coordinate is outside the specified countries, the returned city
will be the closest city as the crow flies in the specified countries.
SUGGEST only options:
--min-score <score> The minimum Jaro-Winkler distance score.
[default: 0.8]
--admin1 <admin1_list> The comma-delimited, case-insensitive list of admin1s to filter for.
If all uppercase, it will be treated as an admin1 code (e.g. US.NY, JP.40, CN.23).
Otherwise, it will be treated as an admin1 name (e.g New York, Tokyo, Shanghai).
Requires the --country option. However, if all admin1 codes have the same
prefix (e.g. US.TX, US.NJ, US.CA), the country can be inferred from the
admin1 code (in this example - US), and the --country option is not required.
If specifying multiple admin1 filters, you can mix admin1 codes and names,
and they are matched in priority order.
Matches are made using a starts_with() comparison (i.e. "US" will match "US.NY",
"US.NJ", etc. for admin1 code. "New" will match "New York", "New Jersey",
"Newfoundland", etc. for admin1 name.)
admin1 is the second priority filter, and will be applied after country filters.
See https://download.geonames.org/export/dump/admin1CodesASCII.txt for
recognized admin1 codes/names.
REVERSE only option:
-k, --k_weight <weight> Use population-weighted distance for reverse subcommand.
(i.e. nearest.distance - k * city.population)
Larger values will favor more populated cities.
If not set (default), the population is not used and the
nearest city is returned.
-f, --formatstr=<string> The place format to use. It has three options:
1. Use one of the predefined formats.
2. Use dynamic formatting to create a custom format.
3. Use the special format "%dyncols:" to dynamically add multiple
columns to the output CSV using fields from a geocode result.
PREDEFINED FORMATS:
- '%city-state' - e.g. Brooklyn, New York
- '%city-country' - Brooklyn, US
- '%city-state-country' | '%city-admin1-country' - Brooklyn, New York US
- '%city-county-state' | '%city-admin2-admin1' - Brooklyn, Kings County, New York
- '%city' - Brooklyn
- '%state' | '%admin1' - New York
- '%county' | '%admin2' - Kings County
- '%country' - US
- '%country_name' - United States
- '%cityrecord' - returns the full city record as a string
- '%admin1record' - returns the full admin1 record as a string
- '%admin2record' - returns the full admin2 record as a string
- '%lat-long' - <latitude>, <longitude>
- '%location' - (<latitude>, <longitude>)
- '%id' - the Geonames ID
- '%capital' - the capital
- '%continent' - the continent (only valid for countryinfo subcommand)
- '%population' - the population
- '%timezone' - the timezone
- '%json' - the full city record as JSON
- '%pretty-json' - the full city record as pretty JSON
- '%+' - use the subcommand's default format.
suggest - '%location'
suggestnow - '{name}, {admin1} {country}: {latitude}, {longitude}'
reverse & reversenow - '%city-admin1-country'
countryinfo - '%country_name'
If an invalid format is specified, it will be treated as '%+'.
Note that when using the JSON predefined formats with the now subcommands,
the output will be valid JSON, as the "Location" header will be omitted.
DYNAMIC FORMATTING:
Alternatively, you can use dynamic formatting to create a custom format.
To do so, set the --formatstr to a dynfmt template, enclosing field names
in curly braces.
The following ten cityrecord fields are available:
id, name, latitude, longitude, country, admin1, admin2, capital,
timezone, population
Fifteen additional countryinfo fields are also available:
iso3, fips, area, country_population, continent, tld, currency_code,
currency_name, phone, postal_code_format, postal_code_regex, languages,
country_geonameid, neighbours, equivalent_fips_code
For US places, two additional fields are available:
us_county_fips_code and us_state_fips_code
e.g. "City: {name}, State: {admin1}, Country: {country} {continent} - {languages}"
If an invalid template is specified, "Invalid dynfmt template" is returned.
Both predefined and dynamic formatting are cached. Subsequent calls
with the same result will be faster as it will use the cached result instead
of searching the Geonames index.
DYNAMIC COLUMNS ("%dyncols:") FORMATTING:
Finally, you can use the special format "%dyncols:" to dynamically add multiple
columns to the output CSV using fields from a geocode result.
To do so, set --formatstr to "%dyncols:" followed by a comma-delimited list
of key:value pairs enclosed in curly braces.
The key is the desired column name and the value is one of the same fields
available for dynamic formatting.
e.g. "%dyncols: {city_col:name}, {state_col:admin1}, {county_col:admin2}"
will add three columns to the output CSV named city_col, state_col & county_col.
Note that using "%dyncols:" will cause the command to geocode EACH row without
using the cache, so it will be slower than predefined or dynamic formatting.
Also, countryinfo and countryinfonow subcommands currently do not support "%dyncols:".
[default: %+]
-l, --language <lang> The language to use when geocoding. The language is specified as an ISO 639-1 code.
Note that the Geonames index must have been built with the specified language
using the `index-update` subcommand with the --languages option.
If the language is not available, the first language in the index is used.
[default: en]
--invalid-result <string> The string to return when the geocode result is empty/invalid.
If not set, the original value is used.
-j, --jobs <arg> The number of jobs to run in parallel.
When not set, the number of jobs is set to the number of CPUs detected.
-b, --batch <size> The number of rows per batch to load into memory, before running in parallel.
Set to 0 to load all rows in one batch.
[default: 50000]
--timeout <seconds> Timeout for downloading Geonames cities index.
[default: 120]
--cache-dir <dir> The directory to use for caching the Geonames cities index.
If the directory does not exist, qsv will attempt to create it.
If the QSV_CACHE_DIR envvar is set, it will be used instead.
[default: ~/.qsv-cache]
INDEX-UPDATE only options:
--languages <lang-list> The comma-delimited, case-insensitive list of languages to use when building
the Geonames cities index.
The languages are specified as a comma-separated list of ISO 639-2 codes.
See https://download.geonames.org/export/dump/iso-languagecodes.txt to look up codes
and https://download.geonames.org/export/dump/alternatenames/ for the supported
language files. 253 languages are currently supported.
[default: en]
--cities-url <url> The URL to download the Geonames cities file from. There are several
available at https://download.geonames.org/export/dump/.
cities500.zip - cities with populations > 500; ~200k cities, 56mb
cities1000.zip - population > 1000; ~140k cities, 44mb
cities5000.zip - population > 5000; ~53k cities, 21mb
cities15000.zip - population > 15000; ~26k cities, 13mb
Note that the more cities are included, the larger the local index file will be,
lookup times will be slower, and the search results will be different.
For convenience, if this is set to 500, 1000, 5000 or 15000, it will be
converted to a geonames cities URL.
[default: https://download.geonames.org/export/dump/cities15000.zip]
--force Force update the Geonames cities index. If not set, qsv will check if there
are updates available at Geonames.org before updating the index.
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
-p, --progressbar Show progress bars. Will also show the cache hit rate upon completion.
Not valid for stdin.
"#;
use std::{
collections::HashMap,
fs,
path::{Path, PathBuf},
};
use ahash::RandomState;
use cached::{proc_macro::cached, SizedCache};
use dynfmt2::Format;
use geosuggest_core::{
storage::{self, IndexStorage},
CitiesRecord, CountryRecord, Engine,
};
use geosuggest_utils::{IndexUpdater, IndexUpdaterSettings, SourceItem};
use indicatif::{ProgressBar, ProgressDrawTarget};
use log::info;
use phf::phf_map;
use rayon::{
iter::{IndexedParallelIterator, ParallelIterator},
prelude::IntoParallelRefIterator,
};
use regex::Regex;
use serde::Deserialize;
use serde_json::json;
use tempfile::tempdir;
use url::Url;
use util::expand_tilde;
use uuid::Uuid;
use crate::{
clitypes::CliError,
config::{Config, Delimiter},
regex_oncelock,
select::SelectColumns,
util,
util::replace_column_value,
CliResult,
};
// Parsed command-line arguments for `qsv geocode`.
// Populated in `run` by `util::get_args(USAGE, argv)`; every field corresponds to an
// argument, subcommand or option named in USAGE.
// NOTE(review): the arg_/cmd_/flag_ prefixes look like the docopt field-naming
// convention (positional / subcommand / option) - confirm against util::get_args.
#[derive(Deserialize)]
struct Args {
// <column>: the column to geocode (suggest, reverse & countryinfo)
arg_column: String,
// <location>: the value to geocode for the suggestnow, reversenow & countryinfonow subcommands
arg_location: String,
// subcommand selectors - geocode_main translates whichever one is set into a GeocodeSubCmd
cmd_suggest: bool,
cmd_suggestnow: bool,
cmd_reverse: bool,
cmd_reversenow: bool,
cmd_countryinfo: bool,
cmd_countryinfonow: bool,
cmd_index_check: bool,
cmd_index_update: bool,
cmd_index_load: bool,
cmd_index_reset: bool,
// <input>: input file; per USAGE, stdin is read when it is not specified
arg_input: Option<String>,
// <index-file>: alternate Geonames index file (index-load); when None, geocode_main
// falls back to the active index file in the cache directory
arg_index_file: Option<String>,
// --rename: new name for the transformed column; `run` rejects it together with --new-column
flag_rename: Option<String>,
// --country: comma-delimited list of country codes to filter for
flag_country: Option<String>,
// --min-score: minimum Jaro-Winkler score (SUGGEST only option)
flag_min_score: Option<f32>,
// --admin1: comma-delimited list of admin1 codes/names to filter for (SUGGEST only option)
flag_admin1: Option<String>,
// --k_weight: population weight for reverse geocoding (REVERSE only option)
flag_k_weight: Option<f32>,
// --formatstr: predefined format, dynfmt template or "%dyncols:" spec (USAGE default: %+)
flag_formatstr: String,
// --language: language to geocode in (USAGE default: en)
flag_language: String,
// --invalid-result: string to return when the geocode result is empty/invalid
flag_invalid_result: Option<String>,
// --batch: rows per batch (USAGE default: 50000)
flag_batch: usize,
// --timeout: index download timeout in seconds (USAGE default: 120)
flag_timeout: u16,
// --cache-dir: index cache directory; geocode_main prefers the QSV_CACHE_DIR env var when set
flag_cache_dir: String,
// --languages: comma-delimited languages used when building the index (INDEX-UPDATE only)
flag_languages: String,
// --cities-url: Geonames cities file URL; `run` expands the 500/1000/5000/15000
// shortcuts into a full URL and verifies that the value parses as a URL
flag_cities_url: String,
// --force: force an index update (INDEX-UPDATE only)
flag_force: bool,
// --jobs: number of parallel jobs
flag_jobs: Option<usize>,
// --new-column: put the transformed values in a new column; `run` rejects it
// together with --rename or a '%dyncols:' --formatstr
flag_new_column: Option<String>,
// --output: write output to this file instead of stdout
flag_output: Option<String>,
// --delimiter: field delimiter for reading CSV data
flag_delimiter: Option<Delimiter>,
// --progressbar: show progress bars (geocode_main also honors QSV_PROGRESSBAR)
flag_progressbar: bool,
}
// A single admin1 filter entry.
// NOTE(review): presumably built from one element of the --admin1 list, which USAGE
// describes as either an admin1 code (e.g. US.NY) or an admin1 name (e.g. New York) -
// the construction site is outside this view, confirm there.
#[derive(Clone, Debug)]
struct Admin1Filter {
// the admin1 code or name text of this filter
admin1_string: String,
// presumably true when admin1_string is an admin1 code rather than a name - TODO confirm
is_code: bool,
}
// Bundle of four place-name strings: city, admin1, admin2 and country.
// NOTE(review): judging by the type name these are presumably the names resolved for
// the requested --language; the code that fills them is outside this view - confirm.
#[derive(Clone)]
struct NamesLang {
cityname: String,
admin1name: String,
admin2name: String,
countryname: String,
}
// the qsv package version, captured from Cargo at compile time
static QSV_VERSION: &str = env!("CARGO_PKG_VERSION");
// default index filename, "qsv-<package version>-geocode-index.bincode"
// geocode_main uses it when the QSV_GEOCODE_INDEX_FILENAME env var is not set
static DEFAULT_GEOCODE_INDEX_FILENAME: &str =
concat!("qsv-", env!("CARGO_PKG_VERSION"), "-geocode-index.bincode");
// Geonames alternate names archive and the filename handed to the IndexUpdater
// alongside it (the `names` SourceItem in geocode_main)
static DEFAULT_CITIES_NAMES_URL: &str =
"https://download.geonames.org/export/dump/alternateNamesV2.zip";
static DEFAULT_CITIES_NAMES_FILENAME: &str = "alternateNamesV2.txt";
// Geonames country info, admin1 codes and admin2 codes files
// passed to the IndexUpdater settings in geocode_main
static DEFAULT_COUNTRY_INFO_URL: &str = "https://download.geonames.org/export/dump/countryInfo.txt";
static DEFAULT_ADMIN1_CODES_URL: &str =
"https://download.geonames.org/export/dump/admin1CodesASCII.txt";
static DEFAULT_ADMIN2_CODES_URL: &str = "https://download.geonames.org/export/dump/admin2Codes.txt";
// we use a compile time static perfect hash map for US state FIPS codes
// key: two-letter US state abbreviation (plus DC)
// value: the two-digit state FIPS code, kept as a string so the leading zero is preserved
static US_STATES_FIPS_CODES: phf::Map<&'static str, &'static str> = phf_map! {
"AK" => "02",
"AL" => "01",
"AR" => "05",
"AZ" => "04",
"CA" => "06",
"CO" => "08",
"CT" => "09",
"DC" => "11",
"DE" => "10",
"FL" => "12",
"GA" => "13",
"HI" => "15",
"IA" => "19",
"ID" => "16",
"IL" => "17",
"IN" => "18",
"KS" => "20",
"KY" => "21",
"LA" => "22",
"MA" => "25",
"MD" => "24",
"ME" => "23",
"MI" => "26",
"MN" => "27",
"MO" => "29",
"MS" => "28",
"MT" => "30",
"NC" => "37",
"ND" => "38",
"NE" => "31",
"NH" => "33",
"NJ" => "34",
"NM" => "35",
"NV" => "32",
"NY" => "36",
"OH" => "39",
"OK" => "40",
"OR" => "41",
"PA" => "42",
"RI" => "44",
"SC" => "45",
"SD" => "46",
"TN" => "47",
"TX" => "48",
"UT" => "49",
"VT" => "50",
"VA" => "51",
"WA" => "53",
"WI" => "55",
"WV" => "54",
"WY" => "56",
// the following are territories
// and are not included in the default index
// leaving them here for reference
// "AS" => "60",
// "GU" => "66",
// "MP" => "69",
// "PR" => "72",
// "UM" => "74",
// "VI" => "78",
};
// max number of entries in LRU cache
static CACHE_SIZE: usize = 2_000_000;
// max number of entries in fallback LRU cache if we can't allocate CACHE_SIZE
// (a quarter of CACHE_SIZE, i.e. 500_000 entries)
static FALLBACK_CACHE_SIZE: usize = CACHE_SIZE / 4;
// message for an invalid dynfmt template; USAGE documents that
// "Invalid dynfmt template" is returned when an invalid template is specified
static INVALID_DYNFMT: &str = "Invalid dynfmt template.";
// message for an invalid country code
// NOTE(review): presumably returned by the countryinfo lookups - usage is outside this view
static INVALID_COUNTRY_CODE: &str = "Invalid country code.";
// when suggesting with --admin1, how many suggestions to fetch from the engine
// before filtering by admin1
static SUGGEST_ADMIN1_LIMIT: usize = 10;
// valid column values for %dyncols
// when adding new columns, make sure to maintain the sort order
// otherwise, the dyncols check will fail as it uses binary search
// (the entries are in ascending byte order; remember to bump the array length
// in the type when adding one)
static SORTED_VALID_DYNCOLS: [&str; 28] = [
"admin1",
"admin2",
"area",
"capital",
"continent",
"country",
"country_geonameid",
"country_name",
"country_population",
"currency_code",
"currency_name",
"equivalent_fips_code",
"fips",
"id",
"iso3",
"languages",
"latitude",
"longitude",
"name",
"neighbours",
"phone",
"population",
"postal_code_format",
"postal_code_regex",
"timezone",
"tld",
"us_county_fips_code",
"us_state_fips_code",
];
// dyncols populated sentinel value
// NOTE(review): presumably marks that the %dyncols columns were already filled in
// for a row - the code that checks it is outside this view
static DYNCOLS_POPULATED: &str = "_POPULATED";
// valid subcommands
// geocode_main selects exactly one variant from the cmd_* flags in Args
#[derive(Clone, Copy, PartialEq)]
enum GeocodeSubCmd {
// geocoding subcommands that take a column from CSV input
Suggest,
// "now" subcommands take the location from the command line;
// geocode_main wraps it in a one-record temporary CSV
SuggestNow,
Reverse,
ReverseNow,
CountryInfo,
CountryInfoNow,
// index-* subcommands operate on the local Geonames index
IndexCheck,
IndexUpdate,
IndexLoad,
IndexReset,
}
// Entry point of the geocode command.
//
// Parses `argv` against USAGE, rejects option combinations that cannot be honored,
// normalizes --cities-url and then runs the async `geocode_main` to completion on a
// Tokio runtime, returning its result.
//
// Errors: fails through `fail_incorrectusage_clierror!` when --new-column is combined
// with --rename or with a '%dyncols:' --formatstr, when a numeric --cities-url is not
// one of the supported shortcuts, or when --cities-url does not parse as a URL.
// Errors from argument parsing, runtime creation and `geocode_main` are propagated.
pub fn run(argv: &[&str]) -> CliResult<()> {
    let mut args: Args = util::get_args(USAGE, argv)?;

    // --new-column cannot be combined with --rename nor with the '%dyncols:' format
    if args.flag_new_column.is_some() {
        if args.flag_rename.is_some() {
            return fail_incorrectusage_clierror!(
                "Cannot use --new-column and --rename at the same time."
            );
        }
        if args.flag_formatstr.starts_with("%dyncols:") {
            return fail_incorrectusage_clierror!(
                "Cannot use --new-column with the '%dyncols:' --formatstr option."
            );
        }
    }

    // convenience shortcut: a --cities-url that is just a number is taken to be a
    // geonames cities file ID, which we expand into the full download URL
    let is_numeric_shortcut = args.flag_cities_url.parse::<u16>().is_ok();
    if is_numeric_shortcut {
        let cities_id = args.flag_cities_url;
        // only the four published cities files are accepted
        if !matches!(cities_id.as_str(), "500" | "1000" | "5000" | "15000") {
            return fail_incorrectusage_clierror!(
                "Invalid --cities-url: {cities_id} - must be one of 500, 1000, 5000 or 15000"
            );
        }
        args.flag_cities_url =
            format!("https://download.geonames.org/export/dump/cities{cities_id}.zip");
    }

    // whatever we ended up with must be a well-formed URL
    match Url::parse(&args.flag_cities_url) {
        Ok(_) => {},
        Err(err) => {
            return fail_incorrectusage_clierror!(
                "Invalid --cities-url: {url} - {err}",
                url = args.flag_cities_url,
                err = err
            );
        },
    }

    // geosuggest is async, so geocode_main has to be driven by a tokio runtime
    let runtime = tokio::runtime::Runtime::new()?;
    runtime.block_on(geocode_main(args))
}
// main async geocode function that does the actual work
async fn geocode_main(args: Args) -> CliResult<()> {
let mut index_cmd = false;
let mut now_cmd = false;
let geocode_cmd = if args.cmd_suggest {
GeocodeSubCmd::Suggest
} else if args.cmd_reverse {
GeocodeSubCmd::Reverse
} else if args.cmd_countryinfo {
GeocodeSubCmd::CountryInfo
} else if args.cmd_suggestnow {
now_cmd = true;
GeocodeSubCmd::SuggestNow
} else if args.cmd_reversenow {
now_cmd = true;
GeocodeSubCmd::ReverseNow
} else if args.cmd_countryinfonow {
now_cmd = true;
GeocodeSubCmd::CountryInfoNow
} else if args.cmd_index_check {
index_cmd = true;
GeocodeSubCmd::IndexCheck
} else if args.cmd_index_update {
index_cmd = true;
GeocodeSubCmd::IndexUpdate
} else if args.cmd_index_load {
index_cmd = true;
GeocodeSubCmd::IndexLoad
} else if args.cmd_index_reset {
index_cmd = true;
GeocodeSubCmd::IndexReset
} else {
// should not happen as docopt won't allow it
unreachable!();
};
// setup cache directory
let geocode_cache_dir = if let Ok(cache_dir) = std::env::var("QSV_CACHE_DIR") {
// if QSV_CACHE_DIR env var is set, check if it exists. If it doesn't, create it.
if cache_dir.starts_with('~') {
// QSV_CACHE_DIR starts with ~, expand it
// safety: we know it starts with ~, so it should be safe to unwrap
expand_tilde(&cache_dir).unwrap()
} else {
PathBuf::from(cache_dir)
}
} else {
// QSV_CACHE_DIR env var is not set, use args.flag_cache_dir
// first check if it starts with ~, expand it
if args.flag_cache_dir.starts_with('~') {
// safety: we know it starts with ~, so it should be safe to unwrap
expand_tilde(&args.flag_cache_dir).unwrap()
} else {
PathBuf::from(&args.flag_cache_dir)
}
};
if !Path::new(&geocode_cache_dir).exists() {
fs::create_dir_all(&geocode_cache_dir)?;
}
info!("Using cache directory: {}", geocode_cache_dir.display());
let geocode_index_filename = std::env::var("QSV_GEOCODE_INDEX_FILENAME")
.unwrap_or_else(|_| DEFAULT_GEOCODE_INDEX_FILENAME.to_string());
let active_geocode_index_file =
format!("{}/{}", geocode_cache_dir.display(), geocode_index_filename);
let geocode_index_file = args
.arg_index_file
.clone()
.unwrap_or_else(|| active_geocode_index_file.clone());
// create a TempDir for the one record CSV we're creating if we're doing a Now command
// we're doing this at this scope so the TempDir is automatically dropped after we're done
let tempdir = tempfile::Builder::new().prefix("qsv-geocode").tempdir()?;
// we're doing a SuggestNow, ReverseNow or CountryInfoNow - create a one record CSV in tempdir
// with one column named "Location" and the passed location value and use it as the input
let input = if now_cmd {
let tempdir_path = tempdir.path().to_string_lossy().to_string();
let temp_csv_path = format!("{}/{}.csv", tempdir_path, Uuid::new_v4());
let temp_csv_path = Path::new(&temp_csv_path);
let mut temp_csv_wtr = csv::WriterBuilder::new().from_path(temp_csv_path)?;
temp_csv_wtr.write_record(["Location"])?;
temp_csv_wtr.write_record([&args.arg_location])?;
temp_csv_wtr.flush()?;
Some(temp_csv_path.to_string_lossy().to_string())
} else {
args.arg_input
};
let rconfig = Config::new(input.as_ref())
.delimiter(args.flag_delimiter)
.select(SelectColumns::parse(&args.arg_column)?);
// prep progress bar
let show_progress =
(args.flag_progressbar || util::get_envvar_flag("QSV_PROGRESSBAR")) && !rconfig.is_stdin();
let progress = ProgressBar::with_draw_target(None, ProgressDrawTarget::stderr_with_hz(5));
if show_progress {
util::prep_progress(&progress, util::count_rows(&rconfig)?);
} else {
progress.set_draw_target(ProgressDrawTarget::hidden());
}
if index_cmd {
// cities_filename is derived from the cities_url
// the filename is the last component of the URL with a .txt extension
// e.g. https://download.geonames.org/export/dump/cities15000.zip -> cities15000.txt
// rsplit() walks from the end, so the first item it yields is the last URL component
let cities_filename = args
    .flag_cities_url
    .rsplit('/')
    .next()
    .expect("rsplit always yields at least one item")
    .replace(".zip", ".txt");
// setup languages
// the comma-separated languages list is trimmed & lowercased into owned strings first,
// as the updater settings only borrow them as &str
let languages_string_vec = args
    .flag_languages
    .split(',')
    .map(|s| s.trim().to_ascii_lowercase())
    .collect::<Vec<String>>();
let languages_vec: Vec<&str> = languages_string_vec
    .iter()
    .map(std::string::String::as_str)
    .collect();
info!("geocode_index_file: {geocode_index_file} Languages: {languages_vec:?}");
let indexupdater_settings = IndexUpdaterSettings {
    // the timeout is specified in seconds, the updater expects milliseconds
    http_timeout_ms: util::timeout_secs(args.flag_timeout)? * 1000,
    cities: SourceItem {
        url: &args.flag_cities_url,
        filename: &cities_filename,
    },
    names: Some(SourceItem {
        url: DEFAULT_CITIES_NAMES_URL,
        filename: DEFAULT_CITIES_NAMES_FILENAME,
    }),
    countries_url: Some(DEFAULT_COUNTRY_INFO_URL),
    admin1_codes_url: Some(DEFAULT_ADMIN1_CODES_URL),
    admin2_codes_url: Some(DEFAULT_ADMIN2_CODES_URL),
    // cloned as languages_vec is still needed for the index-update messages below
    filter_languages: languages_vec.clone(),
};
// the settings are not used again after this point, so they are moved into the updater
let updater = IndexUpdater::new(indexupdater_settings)
    .map_err(|_| CliError::Other("Error initializing IndexUpdater".to_string()))?;
let storage = storage::bincode::Storage::new();
match geocode_cmd {
// check if Geoname index needs to be updated from the Geonames website
// also returns the index file metadata as JSON
GeocodeSubCmd::IndexCheck => {
    winfo!("Checking main Geonames website for updates...");
    check_index_file(&geocode_index_file)?;
    // An index file that carries no metadata is treated as invalid.
    // Bail out with a usage error right away instead of panicking on an unwrap()
    // further down, before the error could ever be reported.
    let metadata = match storage
        .read_metadata(geocode_index_file)
        .map_err(|e| format!("index-check error: {e}"))?
    {
        Some(m) => m,
        None => return fail_incorrectusage_clierror!("Invalid Geonames index file."),
    };
    // if the metadata cannot be serialized, a JSON error document is emitted instead
    let index_metadata_json = match serde_json::to_string_pretty(&metadata) {
        Ok(json) => json,
        Err(e) => {
            let json_error = json!({
                "errors": [{
                    "title": "Cannot serialize index metadata to JSON",
                    "detail": e.to_string()
                }]
            });
            format!("{json_error}")
        },
    };
    let created_at = util::format_systemtime(metadata.created_at, "%+");
    eprintln!("Created at: {created_at}");
    let has_updates = updater.has_updates(&metadata).await.map_err(|_| {
        CliError::Network("Geonames update check failed.".to_string())
    })?;
    if has_updates {
        winfo!(
            "Updates available at Geonames.org. Use `qsv geocode index-update` to \
             update/rebuild the index.\nPlease use this judiciously as Geonames \
             is a free service.\n"
        );
    } else {
        winfo!("Geonames index up-to-date.\n");
    }
    // print to stdout the index metadata as JSON
    // so users can redirect stdout to a JSON file if desired
    println!("{index_metadata_json}");
},
GeocodeSubCmd::IndexUpdate => {
    // update/rebuild Geonames index from Geonames website
    // will only update if there are changes unless --force is specified
    check_index_file(&geocode_index_file)?;
    let metadata = storage
        .read_metadata(geocode_index_file.clone())
        .map_err(|e| format!("index-update error: {e}"))?;
    if args.flag_force {
        // forced rebuild: the current index metadata is not consulted
        winfo!("Forcing fresh build of Geonames index: {geocode_index_file}");
        winfo!(
            "Using cities URL: {} Languages: {:?}",
            args.flag_cities_url,
            languages_vec
        );
        winfo!(
            "This will take a while as we need to download data & rebuild the index..."
        );
        let engine = updater.build().await.map_err(|_| {
            CliError::Other("Error building geonames index.".to_string())
        })?;
        storage
            .dump_to(geocode_index_file.clone(), &engine)
            .map_err(|e| format!("{e}"))?;
        winfo!("Geonames index successfully rebuilt: {geocode_index_file}");
    } else {
        // without metadata there is nothing to compare against the Geonames website,
        // so report an invalid index instead of panicking on an unwrap()
        let metadata = match metadata {
            Some(m) => m,
            None => return fail_incorrectusage_clierror!("Invalid Geonames index file."),
        };
        winfo!("Checking main Geonames website for updates...");
        let has_updates = updater.has_updates(&metadata).await.map_err(|_| {
            CliError::Network("Geonames update check failed.".to_string())
        })?;
        if has_updates {
            winfo!(
                "Updating/Rebuilding Geonames index. This will take a while as we \
                 need to download data from Geonames & rebuild the index..."
            );
            let engine = updater.build().await.map_err(|_| {
                CliError::Other("Error updating geonames index.".to_string())
            })?;
            // propagate write failures (as the --force branch does) so that success
            // is only reported when the index was actually written
            storage
                .dump_to(geocode_index_file.clone(), &engine)
                .map_err(|e| format!("{e}"))?;
            winfo!("Updates successfully applied: {geocode_index_file}");
        } else {
            winfo!("Skipping update. Geonames index is up-to-date.");
        }
    }
},
GeocodeSubCmd::IndexLoad => {
    // load alternate geocode index file
    let index_file = match args.arg_index_file {
        Some(f) => f,
        None => {
            return fail_incorrectusage_clierror!(
                "No alternate Geonames index file specified."
            );
        },
    };
    winfo!("Validating alternate Geonames index: {index_file}...");
    check_index_file(&index_file)?;
    let engine = load_engine(index_file.clone().into(), &progress).await?;
    // an engine that loaded but carries no metadata is not a valid index
    if engine.metadata.is_none() {
        return fail_incorrectusage_clierror!(
            "Alternate Geonames index file {index_file} is invalid."
        );
    }
    // we successfully loaded the alternate geocode index file, so its valid
    // copy it to the default geocode index file
    // write failures are propagated so that the success message below is only
    // shown when the copy was actually written
    storage
        .dump_to(active_geocode_index_file.clone(), &engine)
        .map_err(|e| format!("{e}"))?;
    winfo!(
        "Valid Geonames index file {index_file} successfully copied to \
         {active_geocode_index_file}. It will be used from now on or until \
         you reset/rebuild it.",
    );
},
GeocodeSubCmd::IndexReset => {
    // reset geocode index by deleting the current local copy
    // and downloading the default geocode index for the current qsv version
    winfo!("Resetting Geonames index to default: {geocode_index_file}...");
    // a local copy that is already gone is not an error for a reset;
    // any other I/O failure (e.g. permissions) is still reported
    if let Err(e) = fs::remove_file(&geocode_index_file) {
        if e.kind() != std::io::ErrorKind::NotFound {
            return Err(e.into());
        }
    }
    load_engine(geocode_index_file.clone().into(), &progress).await?;
    winfo!("Default Geonames index file successfully reset to {QSV_VERSION} release.");
},
// index_cmd is true, so we should never get a non-index subcommand
_ => unreachable!(),
}
return Ok(());
}
// we're not doing an index subcommand, so we're doing a suggest/now, reverse/now
// or countryinfo/now subcommand. Load the current local Geonames index
let engine = load_engine(geocode_index_file.clone().into(), &progress).await?;
// reader over the input, built from the reader configuration set up earlier
let mut rdr = rconfig.reader()?;
// writer for the requested output
let mut wtr = Config::new(args.flag_output.as_ref())
.quote_style(
// if we're doing a now subcommand with JSON output, we don't want the CSV writer
// to close quote the output as it will produce invalid JSON
if now_cmd && (args.flag_formatstr == "%json" || args.flag_formatstr == "%pretty-json")
{
csv::QuoteStyle::Never
} else {
csv::QuoteStyle::Necessary
},
)
.writer()?;
// resolve the column selection against the byte headers of the input
let headers = rdr.byte_headers()?.clone();
let sel = rconfig.selection(&headers)?;
// column_index is the index of the first selected column
// NOTE(review): the unwrap() panics should the selection ever be empty —
// presumably rconfig.selection() never returns an empty selection; confirm
let column_index = *sel.iter().next().unwrap();
// from here on, headers is the string version of the header record (shadowing the
// byte headers above) and is mutable so it can be adjusted for the output
let mut headers = rdr.headers()?.clone();
// apply the requested renames to the selected columns, if any
if let Some(rename_spec) = args.flag_rename {
    let renamed_cols = util::ColumnNameParser::new(&rename_spec).parse()?;
    // every selected column needs exactly one new name
    if sel.len() != renamed_cols.len() {
        return fail_incorrectusage_clierror!(
            "Number of new columns does not match input column selection."
        );
    }
    // lengths match, so pairing selection & new names up one-to-one covers them all
    for (&selected_idx, renamed_col) in sel.iter().zip(renamed_cols.iter()) {
        headers = replace_column_value(&headers, selected_idx, renamed_col);
    }
}
// setup output headers
// a requested new column is appended at the end of the header record
if let Some(appended_col) = &args.flag_new_column {
    headers.push_field(appended_col);
}
// if formatstr starts with "%dyncols:", then we're using dynfmt to add columns.
// To add columns, we enclose in curly braces a key:value pair for each column with
// the key being the desired column name and the value being the CityRecord field
// we want to add to the CSV
// e.g. "%dyncols: {city_col:name}, {state_col:admin1}, {country_col:country}"
// will add three columns to the CSV named city_col, state_col and country_col.
// first, parse the formatstr to get the column names and values in parallel vectors
let mut column_names = Vec::new();
let mut column_values = Vec::new();
// dyncols_len is the number of columns we're adding in dyncols mode
// it also doubles as a flag to indicate if we're using dyncols mode
// i.e. if dyncols_len > 0, we're using dyncols mode; 0 we're not
let dyncols_len = if args.flag_formatstr.starts_with("%dyncols:") {
for column in args.flag_formatstr[9..].split(',') {
let column = column.trim();
let column_key_value: Vec<&str> = column.split(':').collect();
if column_key_value.len() == 2 {
column_names.push(column_key_value[0].trim_matches('{'));
column_values.push(column_key_value[1].trim_matches('}'));
}
}
// now, validate the column values
// the valid column values are in SORTED_VALID_DYNCOLS
for column_value in &column_values {
if SORTED_VALID_DYNCOLS.binary_search(column_value).is_err() {
return fail_incorrectusage_clierror!(
"Invalid column value: {column_value}. Valid values are: \
{SORTED_VALID_DYNCOLS:?}"