forked from gigablast/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathConf.h
800 lines (656 loc) · 23.2 KB
/
Conf.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
// Copyright Matt Wells, Apr 2001
// . every host has a config record
// . like tagdb, record in 100% xml
// . allows remote configuration of hosts through Msg4 class
// . remote user sends some xml, we set our member vars using that xml
// . when we save to disk we convert our mem vars to xml
// . is global so everybody can see it
// . conf record can be changed by director OR with the host's priv key
// . use Conf remotely to get setup info about a specific host
// . get your local ip/port/groupMask/etc. from this class not HostMap
#ifndef _CONF_H_
#define _CONF_H_
//#include "../../rsa/rsa.h" // for private_key and public_key types
#include "Xml.h" // Xml class
#include "File.h" // File class
#include "ip.h" // atoip()
#include "Hostdb.h" // g_hostdb.makeGroupId(),makeGroupMask()
#include "HttpRequest.h"
#include "TcpSocket.h"
#include "Url.h" // MAX_COLL_LEN
#include "Collectiondb.h"
#define MAX_MASTER_IPS 15
#define MAX_MASTER_PASSWORDS 10
#define USERAGENTMAXSIZE 128
#define PASSWORD_MAX_LEN 12
#define MAX_CONNECT_IPS 128
#define AUTOBAN_TEXT_SIZE (32*8192)
#define MAX_DNSIPS 16
#define MAX_RNSIPS 13
#define MAX_MX_LEN 128
#define MAX_EMAIL_LEN 64
#define USERS_TEXT_SIZE 500000
#define MAX_GEOCODERS 4
class Conf {
public:
Conf();
bool isMasterAdmin ( class TcpSocket *s , class HttpRequest *r );
bool isSpamAssassin ( class TcpSocket *s , class HttpRequest *r );
bool isAdminIp ( unsigned long ip );
bool isConnectIp ( unsigned long ip );
// loads conf parms from this file "{dir}/gb.conf"
bool init ( char *dir );
void setRootIps();
// set from a buffer of null-terminated xml
bool add ( char *xml );
// saves any changes to the conf file
bool save ( );
// reset all values to their defaults
void reset();
// verify that some values are ok
bool verify();
// . get the default collection based on hostname
// will look for the hostname in each collection for a match
// no match defaults to default collection
char *getDefaultColl ( char *hostname, long hostnameLen );
// hold the filename of this conf file
char m_confFilename[256];
// general info
//bool m_isTrustedNet;
//char m_dir[256]; // our mattster root working dir
//long m_ip; // now in hostdb conf file
//bool m_isTrusted; // is the whole network trusted?
//private_key m_privKey; // our private key for this host
// max amount of memory we can use
long long m_maxMem;
// if this is false, we do not save, used by dump routines
// in main.cpp so they can change parms here and not worry about
// a core dump saving them
char m_save;
//director info (optional) (used iff m_isTrustedNet is false)
//public_key m_dirPubKey; // everyone should know director's pub key
//private_key m_dirPrivKey; // this is 0 if we don't know it
// . external ip of our firewall/router/...
// . regular users use this to connect
// . Host::m_externalIp/Port is used by admin
// . Host::m_ip/port is for machine to machine communication or
// if admin is coming from a local machine
//unsigned long m_mainExternalIp;
//unsigned short m_mainExternalPort;
// . our group info
//long m_hostId; // our hostId
//long m_numGroups;
//unsigned long m_groupId; // hi bits are set before low bits
//unsigned long m_groupMask; // hi bits are set before low bits
// the main directory
//char m_dir[256];
// an additional strip directory on a different drive
char m_stripeDir[256];
char m_defaultColl [ MAX_COLL_LEN + 1 ];
char m_dirColl [ MAX_COLL_LEN + 1];
char m_dirHost [ MAX_URL_LEN ];
char m_clusterName[32];
// . dns parameters
// . dnsDir should hold our saved cached (TODO: save the dns cache)
//short m_dnsClientPort;
long m_numDns ;
long m_dnsIps[MAX_DNSIPS];
short m_dnsPorts[MAX_DNSIPS];
long m_dnsMaxCacheMem;
bool m_dnsSaveCache;
long m_geocoderIps[MAX_GEOCODERS];
long m_wikiProxyIp;
long m_wikiProxyPort;
// built-in dns parameters using name servers
char m_askRootNameservers;
long m_numRns;
long m_rnsIps[MAX_RNSIPS];
short m_rnsPorts[MAX_RNSIPS];
// log absolute filename
//char m_logFilename[256];
// hostdb absolute conf filename
//char m_hostdbFilename[256];
// used to limit all rdb's to one merge per machine at a time
long m_mergeBufSize;
// tagdb parameters
long m_tagdbMaxTreeMem;
long m_tagdbMaxDiskPageCacheMem;
//long m_tagdbMaxCacheMem;
//bool m_tagdbUseSeals;
//long m_tagdbMinFilesToMerge;
//bool m_tagdbSaveCache;
// catdb parameters
long m_catdbMaxTreeMem;
long m_catdbMaxDiskPageCacheMem;
long m_catdbMaxCacheMem;
long m_catdbMinFilesToMerge;
long m_revdbMaxTreeMem;
long m_timedbMaxTreeMem;
// titledb parameters
//long m_titledbMaxTreeMem; // why isn't this used?
//long m_titledbMaxCacheMem;
//long m_titledbMinFilesToMerge;
//long m_titledbMaxCacheAge;
//bool m_titledbSaveCache;
// clusterdb for site clustering, each rec is 16 bytes
long m_clusterdbMaxTreeMem;
//long m_clusterdbMaxCacheMem;
//long m_clusterdbMaxDiskPageCacheMem;
long m_clusterdbMinFilesToMerge;
bool m_clusterdbSaveCache;
//bool m_indexEventsOnly;
// linkdb for storing linking relations
long m_linkdbMaxTreeMem;
// long m_linkdbMaxCacheMem;
long m_linkdbMaxDiskPageCacheMem;
long m_linkdbMinFilesToMerge;
// bool m_linkdbSaveCache;
// dup vector cache max mem
long m_maxVectorCacheMem;
// checksumdb for doc deduping, each rec is 12-16 bytes
//long m_checksumdbMaxTreeMem;
//long m_checksumdbMaxCacheMem;
//long m_checksumdbMaxDiskPageCacheMem;
//long m_checksumdbMinFilesToMerge;
// size of Checksumdb keys for this host
//long m_checksumdbKeySize;
//bool m_checksumdbSaveCache;
// for holding urls that have been entered into the spider queue
//long m_tfndbMaxTreeMem ;
long m_tfndbMaxDiskPageCacheMem ; // for the DiskPageCache class only
//long m_tfndbMinFilesToMerge;
//bool m_tfndbSaveCache;
//long long m_tfndbMaxUrls;
long m_maxCpuThreads;
long m_deadHostTimeout;
long m_sendEmailTimeout;
long m_pingSpacer;
// the spiderdb holds url records for spidering, when to spider, etc..
long m_maxWriteThreads ;
//long m_spiderdbMaxTreeMem ;
//long m_spiderdbMaxCacheMem ;
//long m_spiderdbMaxDiskPageCacheMem ;
//long m_spiderdbMinFilesToMerge;
long m_spiderMaxDiskThreads ;
long m_spiderMaxBigDiskThreads ; // > 1M read
long m_spiderMaxMedDiskThreads ; // 100k - 1M read
long m_spiderMaxSmaDiskThreads ; // < 100k read
long m_queryMaxDiskThreads ;
long m_queryMaxBigDiskThreads ; // > 1M read
long m_queryMaxMedDiskThreads ; // 100k - 1M read
long m_queryMaxSmaDiskThreads ; // < 100k per read
// categorize the disk read sizes by these here
long m_bigReadSize;
long m_medReadSize;
long m_smaReadSize;
long m_statsdbMaxTreeMem;
long m_statsdbMaxCacheMem;
long m_statsdbMaxDiskPageCacheMem;
//long m_statsdbMinFilesToMerge;
bool m_useStatsdb;
//bool m_statsdbSnapshots;
//bool m_statsdbPageEnabled;
//long m_spiderdbRootUrlPriority; // 0-7
//long m_spiderdbAddUrlPriority ;
//long m_minSpiderPriority ; // min spiderRec priority to spider
//long m_maxSpidersPerDomain ; // per foreign domain
//long m_maxRespiderWait ; // in seconds to re-spider a page
//long m_minRespiderWait ; // in seconds to re-spider a page
// this is now in the root collection record
//long m_maxNumSpiders ; // per local spider host
bool m_spideringEnabled ;
bool m_turkingEnabled ;
//bool m_webSpideringEnabled;
//bool m_facebookSpideringEnabled;
//bool m_stubHubSpideringEnabled;
//bool m_eventBriteSpideringEnabled;
//bool m_refreshFacebookUsersEnabled;
//bool m_injectionEnabled ;
// qa testing loop going on? uses "test" subdir
bool m_testParserEnabled ;
bool m_testSpiderEnabled ;
//bool m_doDocIdRangeSplitting ;
bool m_testSearchEnabled ;
//bool m_spiderLoggingEnabled ;
//bool m_logWarnings ; // generally small problems
//bool m_logCongestion ; // ENOSLOTS
bool m_addUrlEnabled ; // TODO: use at http interface level
bool m_adFeedEnabled ;
//bool m_timingDebugEnabled ;
//bool m_threadDebugEnabled ;
//bool m_httpServerEnabled ;// don't allow seo bots on all machines
bool m_doStripeBalancing ;
// . true if the server is on the production cluster
// . we enforce the 'elvtune -w 32 /dev/sd?' cmd on all drives because
// that yields higher performance when dumping/merging on disk
bool m_isLive;
// is this a buzzlogic cluster?
//bool m_isBuzzLogic;
// is this a wikipedia cluster?
bool m_isWikipedia;
//bool m_spiderLinks ;
//bool m_dedupingEnabled ; // dedup content on same mid domain
//long m_retryNum ; // how many times to retry url b4 nuke
//bool m_useIfModifiedSince ;
//bool m_doUrlSpamCheck ; // disallow urls w/ naughty hostnames
//bool m_timeBetweenUrls ; // for urls from same domain only
// for holding robot.txt files for various hostnames
long m_robotdbMaxCacheMem ;
bool m_robotdbSaveCache;
// indexdb has a max cached age for getting IndexLists (10 mins deflt)
long m_indexdbMaxTreeMem ;
long m_indexdbMaxCacheMem;
long m_indexdbMaxDiskPageCacheMem; // for DiskPageCache class only
long m_indexdbMaxIndexListAge;
long m_indexdbTruncationLimit;
long m_indexdbMinFilesToMerge;
bool m_indexdbSaveCache;
long m_datedbMaxTreeMem ;
long m_datedbMaxCacheMem;
long m_datedbMaxDiskPageCacheMem; // for DiskPageCache class only
long m_datedbMaxIndexListAge;
long m_datedbTruncationLimit;
long m_datedbMinFilesToMerge;
bool m_datedbSaveCache;
// for caching exact quotas in Msg36.cpp
//long m_quotaTableMaxMem;
//bool m_useBuckets;
// port of the main udp server
short m_udpPort;
// TODO: parse these out!!!!
//char m_httpRootDir[256] ;
//short m_httpPort ; now in hosts.conf only
long m_httpMaxSockets ;
long m_httpsMaxSockets ;
//long m_httpMaxReadBufSize ;
long m_httpMaxSendBufSize ;
//long m_httpMaxDownloadSockets ;
// a search results cache (for Msg40)
long m_searchResultsMaxCacheMem ;
long m_searchResultsMaxCacheAge ; // in seconds
bool m_searchResultsSaveCache;
// a sitelinkinfo cache (for Msg25)
long m_siteLinkInfoMaxCacheMem;
long m_siteLinkInfoMaxCacheAge;
bool m_siteLinkInfoSaveCache;
// a sitelinkinfo cache (for MsgD)
long m_siteQualityMaxCacheMem;
long m_siteQualityMaxCacheAge;
bool m_siteQualitySaveCache;
// a sitelinkinfo cache (for Msg25)
// for downloading an rdb
//long m_downloadBufSize; // how big should hosts read buf be?
// . how many incoming links should we sample?
// . used for linkText and quality weighting from number of links
// and their total base quality
long m_maxIncomingLinksToSample;
// phrase weighting
float m_queryPhraseWeight;
// for Weights.cpp
long m_sliderParm;
//long m_indexTableIntersectionAlgo;
// . maxmimum relative weight of a query term (1.0 to inf)
// . default about 8?
//float m_queryMaxMultiplier;
// send emails when a host goes down?
bool m_sendEmailAlerts;
//should we delay when only 1 host goes down out of twins till 9 30 am?
bool m_delayNonCriticalEmailAlerts;
//delay emails after
char m_delayEmailsAfter[6];
//delay emails before
char m_delayEmailsBefore[6];
//bool m_sendEmailAlertsToMattTmobile;
//bool m_sendEmailAlertsToMattAlltell;
//bool m_sendEmailAlertsToJavier;
//bool m_sendEmailAlertsToMelissa;
//bool m_sendEmailAlertsToPartap;
//bool m_sendEmailAlertsToCinco;
bool m_sendEmailAlertsToSysadmin;
//bool m_sendEmailAlertsToZak;
bool m_sendEmailAlertsToEmail1;
char m_email1MX[MAX_MX_LEN];
char m_email1Addr[MAX_EMAIL_LEN];
char m_email1From[MAX_EMAIL_LEN];
bool m_sendEmailAlertsToEmail2;
char m_email2MX[MAX_MX_LEN];
char m_email2Addr[MAX_EMAIL_LEN];
char m_email2From[MAX_EMAIL_LEN];
bool m_sendEmailAlertsToEmail3;
char m_email3MX[MAX_MX_LEN];
char m_email3Addr[MAX_EMAIL_LEN];
char m_email3From[MAX_EMAIL_LEN];
bool m_sendEmailAlertsToEmail4;
char m_email4MX[MAX_MX_LEN];
char m_email4Addr[MAX_EMAIL_LEN];
char m_email4From[MAX_EMAIL_LEN];
//bool m_sendEmailAlertsToSabino;
char m_errstr1[MAX_URL_LEN];
char m_errstr2[MAX_URL_LEN];
char m_errstr3[MAX_URL_LEN];
char m_sendParmChangeAlertsToEmail1;
char m_sendParmChangeAlertsToEmail2;
char m_sendParmChangeAlertsToEmail3;
char m_sendParmChangeAlertsToEmail4;
float m_avgQueryTimeThreshold;
//float m_maxQueryTime;
float m_querySuccessThreshold;
long m_numQueryTimes;
long m_maxCorruptLists;
// limit to how big a serialized query can be before just storing
// the raw string instead, keeps network traffic down at the expense
// of processing time, used by Msg serialization
long m_maxSerializedQuerySize;
// the spider won't go if this bandiwdth rate is currently exceeded
float m_maxIncomingKbps;
// max pgs/sec to index and delete from index. guards resources.
float m_maxPagesPerSecond;
float m_maxLoadAvg;
// redhat 9's NPTL doesn't like our async signals
bool m_allowAsyncSignals;
// if in read-only mode we do no spidering and load no saved trees
// so we can use all mem for caching index lists
bool m_readOnlyMode;
// if this is true we use /etc/hosts for hostname lookup before dns
bool m_useEtcHosts;
bool m_useMergeToken;
// . should we always read data from local machine if available?
// . if your network is not gigabit, this may be a good idea
bool m_preferLocalReads;
// should we bypass load balancing and always send titledb record
// lookup requests to a host to maxmize tfndb page cache hits?
//bool m_useBiasedTfndb;
// calls fsync(fd) if true after each write
bool m_flushWrites ;
bool m_verifyWrites;
long m_corruptRetries;
// log unfreed memory on exit
bool m_detectMemLeaks;
// . if false we will not keep spelling information in memory
// . we will keep the popularity info from dict though, since related
// topics requires that
bool m_doSpellChecking;
// . give suggestions to narrow the search
bool m_doNarrowSearch;
// maximum number of synonyms/stems to expand a word into
//long m_maxSynonyms;
// default affinity for spelling suggestions/numbers
//float m_defaultAffinity;
// threshold for synonym usage
//float m_frequencyThreshold;
// thesaurus configuration
//long m_maxAffinityRequests;
//long m_maxAffinityErrors;
//long m_maxAffinityAge;
//long m_affinityTimeout;
//char m_affinityServer[MAX_URL_LEN];
//char m_affinityParms[MAX_URL_LEN];
// new syncing information
bool m_syncEnabled;
bool m_syncIndexdb;
bool m_syncTitledb;
bool m_syncSpiderdb;
//bool m_syncChecksumdb;
bool m_syncSitedb;
bool m_syncLogging;
bool m_syncDoUnion;
bool m_syncDryRun;
char m_syncHostIds [ 256 ]; // restrict syncing to these host ids
//long m_syncReadBufSize; // limit disk activity for syncing
//long m_syncSeeksPerSecond; // limit disk activity for syncing
long m_syncBytesPerSecond; // limit disk activity for syncing
// if this is true we do not add indexdb keys that *should* already
// be in indexdb. but if you recently upped the m_truncationLimit
// then you can set this to false to add all indexdb keys.
//bool m_onlyAddUnchangedTermIds;
bool m_doIncrementalUpdating;
// always true unless entire indexdb was deleted and we are rebuilding
bool m_indexDeletes;
bool m_splitTwins;
bool m_useThreads;
bool m_useSHM;
bool m_useQuickpoll;
bool m_useDiskPageCacheIndexdb;
bool m_useDiskPageCachePosdb;
bool m_useDiskPageCacheDatedb;
bool m_useDiskPageCacheTitledb;
bool m_useDiskPageCacheSpiderdb;
bool m_useDiskPageCacheTfndb;
bool m_useDiskPageCacheTagdb;
bool m_useDiskPageCacheChecksumdb;
bool m_useDiskPageCacheClusterdb;
bool m_useDiskPageCacheCatdb;
bool m_useDiskPageCacheLinkdb;
//bool m_quickpollCoreOnError;
bool m_useShotgun;
bool m_testMem;
bool m_doConsistencyTesting;
// temporary hack for fixing docid collision resolution bug
bool m_hackFixWords;
bool m_hackFixPhrases;
// flags for excluding docs with only linktext or meta text matches
// for one or more query terms
//bool m_excludeLinkText;
//bool m_excludeMetaText;
// deny robots access to the search results
//bool m_robotCheck;
// scan all titledb files if we can't find the rec where it should be
bool m_scanAllIfNotFound;
// defaults to "Gigabot/1.0"
char m_spiderUserAgent [ USERAGENTMAXSIZE ];
long m_autoSaveFrequency;
long m_docCountAdjustment;
bool m_profilingEnabled;
bool m_dynamicPerfGraph;
long m_minProfThreshold;
bool m_sequentialProfiling;
long m_realTimeProfilerMinQuickPollDelta;
//long m_summaryMode; // JAB: moved to CollectionRec
// . for query-dependent summary/title generation
//long m_titleMaxLen;
//long m_summaryMaxLen;
//long m_summaryMaxNumLines;
//long m_summaryMaxNumCharsPerLine;
//long m_summaryDefaultNumLines;
//char m_summaryFrontHighlightTag[128];
//char m_summaryBackHighlightTag [128];
//
// See Log.h for an explanation of the switches below
//
// GET and POST requests.
bool m_logHttpRequests;
bool m_logAutobannedQueries;
//bool m_logQueryTimes;
// if query took this or more milliseconds, log its time
long m_logQueryTimeThreshold;
bool m_logQueryReply;
bool m_logQueryDebug;
// log what gets into the index
bool m_logSpideredUrls;
// log informational messages, they are not indicative of any error.
bool m_logInfo;
// when out of udp slots
bool m_logNetCongestion;
// doc quota limits, url truncation limits
bool m_logLimits;
// log debug switches
bool m_logDebugAddurl ;
bool m_logDebugAdmin ;
bool m_logDebugBuild ;
bool m_logDebugBuildTime ;
bool m_logDebugDb ;
bool m_logDebugDisk ;
bool m_logDebugDns ;
bool m_logDebugDownloads;
bool m_logDebugFacebook;
bool m_logDebugHttp ;
bool m_logDebugLoop ;
bool m_logDebugLang ;
bool m_logDebugLinkInfo ;
bool m_logDebugMem ;
bool m_logDebugMemUsage;
bool m_logDebugMerge ;
bool m_logDebugNet ;
bool m_logDebugPQR ; // post query rerank
bool m_logDebugQuery ;
bool m_logDebugQuota ;
bool m_logDebugRobots ;
bool m_logDebugSpcache ; // SpiderCache.cpp debug
bool m_logDebugSpeller ;
bool m_logDebugTagdb ;
bool m_logDebugSections;
bool m_logDebugSEO;
bool m_logDebugSEOInserts;
bool m_logDebugStats ;
bool m_logDebugSummary ;
bool m_logDebugSpider ;
bool m_logDebugUrlAttempts ;
bool m_logDebugTcp ;
bool m_logDebugThread ;
bool m_logDebugTimedb ;
bool m_logDebugTitle ;
bool m_logDebugTopics ;
bool m_logDebugTopDocs ;
bool m_logDebugUdp ;
bool m_logDebugUnicode ;
bool m_logDebugRepair ;
bool m_logDebugDate ;
// expensive timing messages
bool m_logTimingAddurl ;
bool m_logTimingAdmin ;
bool m_logTimingBuild;
bool m_logTimingDb;
bool m_logTimingNet;
bool m_logTimingQuery;
bool m_logTimingSpcache;
bool m_logTimingTopics;
// programmer reminders.
bool m_logReminders;
long m_numMasterPwds;
char m_masterPwds[MAX_MASTER_PASSWORDS][PASSWORD_MAX_LEN];
long m_numMasterIps;
long m_masterIps[MAX_MASTER_IPS];
long m_numConnectIps;
long m_connectIps [ MAX_CONNECT_IPS ];
// should we generate similarity/content vector for titleRecs lacking?
// this takes a ~100+ ms, very expensive, so it is just meant for
// testing.
bool m_generateVectorAtQueryTime;
//Users
char m_users [ USERS_TEXT_SIZE ];
long m_usersLen;
char m_superTurks [ USERS_TEXT_SIZE ];
long m_superTurksLen;
long m_maxYippyOut;
char m_doAutoBan;
long m_banIpsLen;
char m_banIps [ AUTOBAN_TEXT_SIZE ];
long m_allowIpsLen;
char m_allowIps [ AUTOBAN_TEXT_SIZE ];
long m_validCodesLen;
char m_validCodes[ AUTOBAN_TEXT_SIZE ];
long m_banRegexLen;
char m_banRegex [ AUTOBAN_TEXT_SIZE ];
long m_extraParmsLen;
char m_extraParms [ AUTOBAN_TEXT_SIZE ];
unsigned char m_numFreeQueriesPerMinute;
unsigned long m_numFreeQueriesPerDay;
char m_redirect[MAX_URL_LEN];
char m_useCompressionProxy;
char m_gzipDownloads;
// used by proxy to make proxy point to the temp cluster while
// the original cluster is updated
char m_useTmpCluster;
char m_timeSyncProxy;
// For remote datafeed verification
//char m_useDFAcctServer;
//long m_dfAcctIp;
//long m_dfAcctPort;
//char m_dfAcctColl[MAX_COLL_LEN];
Xml m_xml;
char m_buf[10*1024];
long m_bufSize;
// . for specifying if this is an interface machine
// messages are rerouted from this machine to the main
// cluster set in the hosts.conf.
bool m_interfaceMachine;
// after we take the natural log of each query term's DF (doc freq.)
// we
float m_queryExp;
//char m_useDynamicPhraseWeighting;
float m_minPopForSpeller; // 0% to 100%
// catdb min site rec size for LARGE but latent domains
long m_catdbMinRecSizes;
// allow scaling up of hosts by removing recs not in the correct
// group. otherwise a sanity check will happen.
char m_allowScale;
// . timeout on dead hosts, only set when we know a host is dead and
// will not come back online. Messages will timeout on the dead
// host, but not error, allowing outstanding spidering to finish
// to the twin
char m_giveupOnDeadHosts;
char m_bypassValidation;
long m_maxHardDriveTemp;
long m_maxHeartbeatDelay;
long m_maxCallbackDelay;
// balance value for Msg6, each host can have this many ready domains
// per global host
//long m_distributedSpiderBalance;
//long m_distributedIpWait;
// parameters for indexdb spitting and tfndb extension bits
//long m_indexdbSplit;
//char m_fullSplit;
//char m_legacyIndexdbSplit;
//long m_tfndbExtBits;
// used by Repair.cpp
char m_repairingEnabled ;
long m_maxRepairSpiders ;
long m_repairMem;
char m_collsToRepair[1024];
char m_fullRebuild ;
char m_fullRebuildKeepNewSpiderRecs;
char m_rebuildRecycleLinkInfo ;
//char m_rebuildRecycleLinkInfo2 ;
//char m_removeBadPages ;
char m_rebuildTitledb ;
//char m_rebuildTfndb ;
//char m_rebuildIndexdb ;
char m_rebuildPosdb ;
//char m_rebuildNoSplits ;
//char m_rebuildDatedb ;
//char m_rebuildChecksumdb ;
char m_rebuildClusterdb ;
char m_rebuildSpiderdb ;
//char m_rebuildSitedb ;
char m_rebuildLinkdb ;
//char m_rebuildTagdb ;
//char m_rebuildPlacedb ;
char m_rebuildTimedb ;
char m_rebuildSectiondb ;
//char m_rebuildRevdb ;
char m_rebuildRoots ;
char m_rebuildNonRoots ;
char m_rebuildSkipSitedbLookup ;
// for caching the qualities of urls (see Msg20.cpp)
long m_maxQualityCacheAge ;
};
extern class Conf g_conf;
#endif
// old stuff:
// key is the hostId. hostId of -1 is the default conf record.
// here's the recognized fields:
// <dirPubKey> // default rec only
// <groupMask> // default rec only
// <rootDir> // default rec only
// <numPolice> // default rec only
// <isTrustedNet> // default rec only
// <hostId> -- stored in hostmap
// <ip> -- stored in hostmap
// <port> -- stored in hostmap
// <networkName> // also in default rec
// <maxDiskSpace>
// <maxMem>
// <maxCpu>
// <maxBps>
// <pubKey>
// <isTrustedHost> // director sealed