Skip to content

Commit

Permalink
Merge pull request ggreer#105 from ggreer/utf8_detection
Browse files Browse the repository at this point in the history
Detect UTF-8 files correctly. Well, at least anything in the BMP (Basic Multilingual Plane).
  • Loading branch information
ggreer committed Nov 7, 2012
2 parents 686e895 + 50e2c1e commit 2858df3
Showing 1 changed file with 26 additions and 7 deletions.
33 changes: 26 additions & 7 deletions src/util.c
Original file line number Diff line number Diff line change
Expand Up @@ -168,21 +168,40 @@ int is_binary(const void* buf, const int buf_len) {
return 0;
}

if (buf_len >= 3 && buf_c[0] == 0xEF && buf_c[1] == 0xBB && buf_c[2] == 0xBF) {
/* UTF-8 BOM. This isn't binary. */
return 0;
}

for (i = 0; i < total_bytes; i++) {
/* Disk IO is so slow that it's worthwhile to do this calculation after every suspicious byte. */
/* This is true even on a 1.6GHz Atom with an Intel 320 SSD. */
/* Read at least 32 bytes before making a decision */
if (i >= 32 && (suspicious_bytes * 100) / total_bytes > 10) {
return 1;
}

if (buf_c[i] == '\0') {
/* NUL char. It's binary */
return 1;
} else if ((buf_c[i] < 7 || buf_c[i] > 14) && (buf_c[i] < 32 || buf_c[i] > 127)) {
suspicious_bytes++;
/* Disk IO is so slow that it's worthwhile to do this calculation after every suspicious byte. */
/* This is true even on a 1.6Ghz Atom with an Intel 320 SSD. */
/* Read at least 32 bytes before making a decision */
if (i > 32 && (suspicious_bytes * 100) / total_bytes > 20) {
return 1;
/* UTF-8 detection */
if (buf_c[i] > 191 && buf_c[i] < 224 && i + 1 < total_bytes) {
i++;
if (buf_c[i] < 192) {
continue;
}
} else if (buf_c[i] > 223 && buf_c[i] < 239 && i + 2 < total_bytes) {
i++;
if (buf_c[i] < 192 && buf_c[i + 1] < 192) {
i++;
continue;
}
}
suspicious_bytes++;
}
}
if ((suspicious_bytes * 100) / total_bytes > 20) {
if ((suspicious_bytes * 100) / total_bytes > 10) {
return 1;
}

Expand Down

0 comments on commit 2858df3

Please sign in to comment.