Skip to content

Commit

Permalink
use better regexp matching in tag stats, closes old issue 384
Browse files Browse the repository at this point in the history
  • Loading branch information
grandsbor committed Sep 27, 2014
1 parent fb88f67 commit 60669bd
Showing 1 changed file with 4 additions and 1 deletion.
5 changes: 4 additions & 1 deletion scripts/stats/update_tag_stats.pl
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@
# Scratch variables; their use lies outside the visible part of the script.
my $ref;
my $prefix;

# Character class covering the lowercase Cyrillic letters а-я plus ё,
# encoded to cp1251 so it can be embedded in the SQL expression below.
# NOTE(review): encode() presumably comes from the Encode module -- the
# import is not visible in this fragment; confirm.
my $cyr_cp1251 = encode('cp1251', '[а-яё]');
# SQL condition that is true when tf_text contains at least one Cyrillic
# letter: the column is converted to cp1251 and both operands of REGEXP are
# given the cp1251_general_ci collation (the "_ci" suffix denotes a
# case-insensitive collation, so only the lowercase range is listed).
# NOTE(review): presumably this works around REGEXP comparing byte-by-byte
# on multi-byte utf8 data -- confirm against the MySQL version in use.
my $cyr_match = "CONVERT(tf_text USING cp1251) COLLATE 'cp1251_general_ci' REGEXP '$cyr_cp1251' COLLATE 'cp1251_general_ci'";

# Connect to MySQL using the settings in $conf; abort with the DBI error
# message if the connection cannot be established.
my $dbh = DBI->connect('DBI:mysql:'.$conf->{'dbname'}.':'.$conf->{'host'}, $conf->{'user'}, $conf->{'passwd'}) or die $DBI::errstr;
# Use utf8 as the connection character set.
$dbh->do("SET NAMES utf8");
# All (book_id, tag_name) pairs, skipping tags that start with 'url:' or
# 'Дата:', ordered by book so that one book's tags arrive together.
my $scan_books = $dbh->prepare("SELECT book_id, tag_name FROM book_tags WHERE tag_name NOT LIKE 'url:%' AND tag_name NOT LIKE 'Дата:%' ORDER BY book_id");
# Number of tokens in a given book (tokens -> sentences -> paragraphs ->
# book_id) whose text contains a Cyrillic letter.
# NOTE(review): $count_words is declared twice in a row; the statement
# directly below replaces this one. In the commit view this line appears to
# be the pre-change version (plain utf8 REGEXP) -- confirm only one of the
# two is present in the actual file.
my $count_words = $dbh->prepare("SELECT COUNT(*) AS cnt FROM tokens WHERE sent_id IN (SELECT sent_id FROM sentences WHERE par_id IN (SELECT par_id FROM paragraphs WHERE book_id=?)) AND tf_text REGEXP '[А-Яа-яЁё]'");
# Same count as above, but using the cp1251-based $cyr_match condition.
my $count_words = $dbh->prepare("SELECT COUNT(*) AS cnt FROM tokens WHERE sent_id IN (SELECT sent_id FROM sentences WHERE par_id IN (SELECT par_id FROM paragraphs WHERE book_id=?)) AND $cyr_match");
# Empties the tag_stats table.
my $drop = $dbh->prepare("TRUNCATE TABLE tag_stats");
# Inserts one four-column row into tag_stats (column meanings are not
# visible in this fragment).
my $ins = $dbh->prepare("INSERT INTO tag_stats VALUES(?, ?, ?, ?)");

Expand Down

0 comments on commit 60669bd

Please sign in to comment.