Skip to content
This repository has been archived by the owner on Dec 25, 2017. It is now read-only.

Commit

Permalink
Added support for searching in UTF-16 little-endian encoded files.
Browse files Browse the repository at this point in the history
It works similarly to searching in compressed files, i.e., it creates a buffer in memory, converts the searched UTF-16 file to UTF-8, and stores the conversion result in that buffer.
  • Loading branch information
nik committed Dec 11, 2015
1 parent 5575e07 commit af65e4e
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 0 deletions.
66 changes: 66 additions & 0 deletions src/encoding.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#include "encoding.h"
#include "util.h"

#include <strsafe.h>
#include <windows.h>

/*
http://stackoverflow.com/questions/3082620/convert-utf-16-to-utf-8
*/
char * convert_utf16_to_utf8(const char *buf, int *utf8_len) {
// Get WCHAR's count corresponding to total input string length
const size_t UTF16_LEN_MAX = INT_MAX - 1;
size_t utf16_len;

HRESULT hr = StringCchLengthW((STRSAFE_PCNZWCH)buf, UTF16_LEN_MAX, &utf16_len);
if (FAILED(hr))
{
DWORD dwError = GetLastError();
die("convert_utf16_to_utf8: StringCchLengthW failed - hr = 0x%X - LastError == 0x%X.", hr, dwError);
}

// Count the terminating \0
++utf16_len;

// Get the size of destination UTF-8 buffer, in CHAR's (= bytes)
*utf8_len = ::WideCharToMultiByte(
CP_UTF8, // convert to UTF-8
0, // specify conversion behavior
(LPCWCH)buf, // source UTF-16 string
(int)utf16_len, // total source string length, in WCHAR's,
// including end-of-string \0
NULL, // unused - no conversion required in this step
0, // request buffer size
NULL, // unsued
NULL // unused
);

if ((*utf8_len) == 0)
{
DWORD dwError = GetLastError();
die("convert_utf16_to_utf8: WideCharToMultiByte (calculating size) failed - LastError == 0x%X.", dwError);
}

// Allocate the destination buffer for UTF-8 string
char *_buf = (char *)malloc(*utf8_len);

// Convert from UTF-16 to UTF-8
int result = ::WideCharToMultiByte(
CP_UTF8, // convert to UTF-8
0, // specify conversion behavior
(LPCWCH)buf, // source UTF-16 string
(int)utf16_len, // total source string length, in WCHAR's,
// including end-of-string \0
_buf, // destination buffer
*utf8_len, // destination buffer size, in bytes
NULL, NULL // unused
);

if (result == 0)
{
DWORD dwError = GetLastError();
die("convert_utf16_to_utf8: WideCharToMultiByte (conversion) failed - LastError == 0x%X.", dwError);
}

return _buf;
}
6 changes: 6 additions & 0 deletions src/encoding.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#ifndef ENCODING_H
#define ENCODING_H

/*
 * Converts the NUL-terminated UTF-16 string stored in buf to UTF-8.
 *
 * buf is reinterpreted as a wide-character string; its length is found by
 * scanning for the terminating 16-bit NUL, so no input length is passed.
 * On return *utf8_len holds the size of the result in bytes, including the
 * terminating '\0'.
 *
 * Returns a buffer obtained from malloc(); the caller must free() it.
 * Conversion failures are reported through die().
 */
char * convert_utf16_to_utf8(const char *buf, int *utf8_len);

#endif
15 changes: 15 additions & 0 deletions src/search.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
#include "util.h"
#include "scandir.h"

#ifdef _WIN32
#include "encoding.h"
#endif


size_t alpha_skip_lookup[256];
size_t *find_skip_lookup;
Expand Down Expand Up @@ -297,6 +301,17 @@ void search_file(const char *file_full_path) {
}
}

#ifdef _WIN32
if (is_utf16le(buf, f_len))
{
int utf8_len = 0;
char *_buf = convert_utf16_to_utf8(buf, &utf8_len);
search_buf(_buf, utf8_len, file_full_path);
free(_buf);
goto cleanup;
}
#endif

search_buf(buf, f_len, file_full_path);

cleanup:
Expand Down
19 changes: 19 additions & 0 deletions src/util.c
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,25 @@ void compile_study(pcre **re, pcre_extra **re_extra, char *q, const int pcre_opt
}
}

/*
 * Report whether buf starts with the UTF-16 little-endian byte order mark.
 *
 * Byte order marks, for reference:
 *   00 00 FE FF   UTF-32, big-endian
 *   FF FE 00 00   UTF-32, little-endian
 *   FE FF         UTF-16, big-endian
 *   FF FE         UTF-16, little-endian
 *   EF BB BF      UTF-8
 *
 * The UTF-32LE mark begins with the same two bytes as the UTF-16LE mark, so
 * a buffer that starts with FF FE 00 00 is taken to be UTF-32LE and rejected.
 *
 * buf      Start of the data to inspect.
 * buf_len  Number of readable bytes in buf.
 *
 * Returns 1 for UTF-16LE, 0 otherwise (including buffers shorter than two
 * bytes).
 */
int is_utf16le(const void *buf, const int buf_len) {
    const unsigned char *bytes = (const unsigned char *)buf;

    if (buf_len < 2 || bytes[0] != 0xFF || bytes[1] != 0xFE) {
        return 0;
    }

    /* FF FE 00 00 is the UTF-32LE mark, not UTF-16LE text. */
    if (buf_len >= 4 && bytes[2] == 0x00 && bytes[3] == 0x00) {
        return 0;
    }

    return 1;
}

/* This function is very hot. It's called on every file. */

int is_binary(const char* buf, const size_t buf_len) {
Expand Down
1 change: 1 addition & 0 deletions src/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ void compile_study(pcre **re, pcre_extra **re_extra, char *q, const int pcre_opt
void *decompress(const ag_compression_type zip_type, const void *buf, const int buf_len, const char *dir_full_path, int *new_buf_len);
ag_compression_type is_zipped(const void *buf, const int buf_len);

/* Returns 1 when the first bytes of buf (buf_len bytes long) carry a UTF-16 little-endian byte order mark, 0 otherwise. */
int is_utf16le(const void *buf, const int buf_len);
int is_binary(const char *buf, const size_t buf_len);
int is_regex(const char *query);
int is_fnmatch(const char *filename);
Expand Down
2 changes: 2 additions & 0 deletions vs2013/ag.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\src\decompress.h" />
<ClInclude Include="..\src\encoding.h" />
<ClInclude Include="..\src\ignore.h" />
<ClInclude Include="..\src\lang.h" />
<ClInclude Include="..\src\log.h" />
Expand All @@ -189,6 +190,7 @@
<ItemGroup>
<ClCompile Include="..\src\decompress.c">
</ClCompile>
<ClCompile Include="..\src\encoding.c" />
<ClCompile Include="..\src\ignore.c">
</ClCompile>
<ClCompile Include="..\src\lang.c">
Expand Down
6 changes: 6 additions & 0 deletions vs2013/ag.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
<ClInclude Include="..\src\decompress.h">
<Filter>src</Filter>
</ClInclude>
<ClInclude Include="..\src\encoding.h">
<Filter>src</Filter>
</ClInclude>
<ClInclude Include="..\src\ignore.h">
<Filter>src</Filter>
</ClInclude>
Expand Down Expand Up @@ -53,6 +56,9 @@
<ClCompile Include="..\src\decompress.c">
<Filter>src</Filter>
</ClCompile>
<ClCompile Include="..\src\encoding.c">
<Filter>src</Filter>
</ClCompile>
<ClCompile Include="..\src\ignore.c">
<Filter>src</Filter>
</ClCompile>
Expand Down

0 comments on commit af65e4e

Please sign in to comment.