Skip to content

Commit

Permalink
WordSmoker: computes entropy of 16-bit words (1 GB/s)
Browse files Browse the repository at this point in the history
  • Loading branch information
Bulat-Ziganshin committed Feb 8, 2014
1 parent 01e3e91 commit a1d51a2
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 6 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ Since already compressed, text and multimedia files are better compressed with s
This project will provide various experimental algorithms that can recognize some special datatypes (not necessarily all), as well as samples of data that are especially hard to smoke correctly.


The full list of algorithms:
The full list of smells:

- ByteDistribution: computes entropy of individual bytes (2 GB/s).
- WordDistribution: computes entropy of 16-bit words.
- DWordDistribution: computes entropy of 32-bit dwords (3 GB/s).
- ByteSmoker: computes entropy of individual bytes (2 GB/s).
- WordSmoker: computes entropy of 16-bit words (1 GB/s).
- DWordSmoker: computes entropy of 32-bit dwords (3 GB/s).
39 changes: 37 additions & 2 deletions smoke.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,41 @@ void ByteSmoker::smoke (void *buf, size_t bufsize, double *entropy)
{
size_t count = count1[i] + count2[i] + count3[i] + count4[i];
if (count)
order0 += count * log(double(bufsize/count))/log(double(2)) / 8;
order0 += count * log(double(bufsize)/count)/log(double(2)) / 8;
}

*entropy = order0 / bufsize;
}


/**************************************************************************/
/* Word smoker: calculate compression ratio with the 16-bit order-0 model */
/**************************************************************************/

// Smoker that treats the input as 16-bit words and keeps one occurrence
// counter per possible word value (256*256 counters in total).
//
// The counter table is heap-allocated once in the constructor and released in
// the destructor; smoke() clears it at the start of every call, so a single
// instance can be reused for any number of buffers.
class WordSmoker : public Smoker
{
    uint32_t *count;    // owned array of 256*256 counters, indexed by 16-bit word value

    // The object owns `count` through a raw pointer, so a member-wise copy
    // would make two objects delete[] the same array. Copying is therefore
    // disabled: both operations are declared private and never defined.
    WordSmoker (const WordSmoker&);
    WordSmoker& operator= (const WordSmoker&);

public:
    // Allocates the counter table; its contents are left uninitialized here
    // because smoke() zeroes the table before counting.
    WordSmoker() : count (new uint32_t[256*256]) {}

    // Human-readable name of this smoker.
    virtual const char* name() {return "WordSmoker";}

    // Releases the counter table.
    virtual ~WordSmoker() {delete[] count;}

    // Counts the 16-bit words of buf[0..bufsize) and stores the resulting
    // order-0 estimate into *entropy (defined out of line).
    virtual void smoke (void *buf, size_t bufsize, double *entropy);
};

// Estimates how well the buffer compresses under a 16-bit order-0 model.
//
// Every overlapping 16-bit word is counted: for each byte offset i in
// [0, bufsize-1) the two bytes starting at p+i are read as one uint16_t and
// the matching counter is incremented. Each distinct word with count c then
// contributes c * log2(bufsize/c) / 16 to the sum, and the sum divided by
// bufsize is written to *entropy.
//
//   buf      - start of the data to analyse (read as raw bytes)
//   bufsize  - size of the data in bytes
//   entropy  - output; receives order0 / bufsize
void WordSmoker::smoke (void *buf, size_t bufsize, double *entropy)
{
// Start from a clean table: the counters persist in the object between calls.
memset (count, 0, 256*256*sizeof(*count));

byte *p = (byte*) buf;
// NOTE(review): bufsize is size_t, so for bufsize == 0 the bound bufsize-1 wraps
// around to SIZE_MAX and the loop reads far past the buffer. Separately, `i` is an
// int compared against a size_t, so buffers larger than INT_MAX bytes overflow it.
// NOTE(review): p+i is odd for every second iteration, so the uint16_t read below
// is unaligned (and type-punned); this relies on the target tolerating such loads.
for (int i=0; i<bufsize-1; i++)
count[ *(uint16_t*)(p+i) ]++;

double order0 = 0;
for (int i=0; i<256*256; i++)
{
// Skip words that never occurred; this also avoids dividing by a zero count.
if (count[i])
// NOTE(review): bufsize-1 words were counted above, but bufsize is used as the
// total here, so the counts do not sum exactly to the denominator - confirm intended.
order0 += count[i] * log(double(bufsize)/count[i])/log(double(2)) / 16;
}

// NOTE(review): yields NaN (0.0/0) if bufsize == 0.
*entropy = order0 / bufsize;
Expand Down Expand Up @@ -154,8 +188,9 @@ int main (int argc, char **argv)
fprintf(stderr, "%sProcessing %s: ", file>1?"\n":"", argv[file]);

ByteSmoker ByteS;
WordSmoker WordS;
DWordSmoker DWordS;
Smoker *smokers[] = {&ByteS, &DWordS};
Smoker *smokers[] = {&ByteS, &WordS, &DWordS};
const int NumSmokers = sizeof(smokers)/sizeof(*smokers);
double entropy, min_entropy[NumSmokers], avg_entropy[NumSmokers] = {0}, max_entropy[NumSmokers] = {0};
for (int i=0; i<NumSmokers; ++i) min_entropy[i] = 1;
Expand Down

0 comments on commit a1d51a2

Please sign in to comment.