Skip to content

Commit

Permalink
first commit, add class TiHtmBase
Browse files Browse the repository at this point in the history
  • Loading branch information
luoxn28 committed Sep 29, 2015
0 parents commit 33ce9a9
Show file tree
Hide file tree
Showing 6 changed files with 380 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
htmtest.cpp
htmtest.html

16 changes: 16 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Object files that make up the htmtest binary.
objects=tinyhtm.o tinyhtmparser.o htmtest.o

# 'clean' does not produce a file named "clean"; mark it phony so a stray
# file with that name cannot disable the rule.
.PHONY: clean

# Link the test driver.
htmtest: $(objects)
	g++ -o htmtest $(objects)

htmtest.o: htmtest.cpp
	g++ -c htmtest.cpp

tinyhtm.o: tinyhtm.h tinyhtm.cpp
	g++ -c tinyhtm.cpp

tinyhtmparser.o: tinyhtm.h tinyhtmparser.cpp
	g++ -c tinyhtmparser.cpp

# Remove the object files; -f keeps the rule from failing when some are
# already missing.
clean:
	rm -f $(objects)
1 change: 1 addition & 0 deletions readme
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is a simple HTML file parser.
72 changes: 72 additions & 0 deletions tinyhtm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#include "tinyhtm.h"

/// Library version string in the form "<name>-<major>.<minor>".
const char *TIHTMVERSION = "tinyhtml-0.1";

/// Returns the current value of TIHTMVERSION.
const char *TiHtmVersion()
{
    return TIHTMVERSION;
}
/// Opens `filename` with the given fopen() `mode`.
///
/// Returns the stream, or NULL when either argument is null or fopen()
/// fails. The null check exists because passing a null pointer to fopen()
/// is undefined behaviour.
static FILE *TiHtmOpen(const char *filename, const char *mode)
{
    if (!filename || !mode)
        return NULL;

    return fopen(filename, mode);
}

// the scope of class TiHtmBase

// the scope of class TiHtmDocument

bool TiHtmDocument::loadFile()
{
return loadFile(value.c_str());
}

/// Loads the file named by `filename`.
/// Forwards to the `const char *` overload and returns its result.
bool TiHtmDocument::loadFile(std::string filename)
{
    const char *path = filename.c_str();
    return loadFile(path);
}

/// Opens `filename` in binary read mode and loads it through the FILE*
/// overload.
///
/// @param filename  Path to open. When null, the path stored in `value` is
///                  used instead.
/// @return true when the file was opened and loaded; false when no path is
///         available, the file cannot be opened, or loading fails.
bool TiHtmDocument::loadFile(const char *filename)
{
    if (!filename)
    {
        if (value.empty())
        {
            // Report the problem to the caller through the return value
            // rather than terminating the whole process.
            TiHtmCout << "filename is null && value is empty" << TiHtmEndl;
            return false;
        }

        // A null pointer must never reach fopen(); fall back to the stored path.
        filename = value.c_str();
    }

    FILE *file = TiHtmOpen(filename, "rb");
    if (!file)
        return false;

    const bool result = loadFile(file);
    fclose(file);
    return result;
}

/// Reads the whole content of `file` and writes it to TiHtmCout.
///
/// The stream is not closed here; that stays with the caller.
///
/// @param file  Open, seekable stream to read from.
/// @return true when the content was read (an empty file counts as
///         success); false when `file` is null or seeking/reading fails.
bool TiHtmDocument::loadFile(FILE *file)
{
    if (!file)
        return false;

    // Find the file size by seeking to the end, then rewind for reading.
    if (fseek(file, 0, SEEK_END) != 0)
        return false;

    const long length = ftell(file);
    if (length < 0 || fseek(file, 0, SEEK_SET) != 0)
        return false;

    // std::string owns the buffer, so every return path releases it.
    std::string buf(static_cast<std::string::size_type>(length), '\0');

    // Skip the read for an empty file: there is nothing to fetch, and that
    // is not an error.
    if (length > 0 && fread(&buf[0], 1, buf.size(), file) != buf.size())
        return false;

    // Print as a C string, so output stops at the first NUL byte.
    TiHtmCout << buf.c_str() << TiHtmEndl;

    return true;
}

111 changes: 111 additions & 0 deletions tinyhtm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#ifndef TINYHTM_H
#define TINYHTM_H

#include <iostream>
#include <string>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

// Aliases for the standard iostream objects and the endl manipulator.
#define TiHtmCout std::cout
#define TiHtmCin std::cin
#define TiHtmEndl std::endl


/// Records a location as a row/column pair.
struct TiHtmCursor
{
    /// A new cursor starts with both fields set to -1.
    TiHtmCursor() { clear(); }

    /// Resets both fields to -1.
    void clear() { row = column = -1; }

    int row;
    int column;
};


/// The base class of the whole tinyhtml library.
class TiHtmBase
{
public:
    TiHtmBase() {}
    virtual ~TiHtmBase() {}

    /// Implemented by subclasses.
    virtual void print() const = 0;

    /// Implemented by subclasses; takes a pointer into the input text and
    /// returns a pointer into it.
    virtual const char *parse(const char *p) = 0;

    /// Row stored in `location`.
    int row() const { return location.row; }

    /// Column stored in `location`.
    int column() const { return location.column; }

    //static void encodeString(const char *str, char *out); // not implemented yet

protected:
    /// True when `c` is a white-space character.
    static bool isWhiteSpace(char c);

    /// Overload for int: values of 256 and above are never white space;
    /// smaller values are narrowed to char and tested.
    static bool isWhiteSpace(int c)
    {
        if (c < 256)
            return isWhiteSpace((char)c);
        return false;
    }

    /// Returns a pointer to the first non-white-space character at or after `p`.
    static const char *skipWhiteSpace(const char *p);

    /// Reads a name that starts with a letter or '_' into `*name`.
    static const char *readName(const char *p, std::string *name);

    /// Reads text into `*text` until `endTag` is found.
    static const char *readText(const char *p, std::string *text, bool trimWhiteSpace, const char *endTag, bool ignoreCase);

    /// Converts an escape entity back to its original character,
    /// e.g. "&lt;" becomes '<'.
    static const char *getEntity(const char *p, char *value);

    /// Reads one character from the input, decoding an escape entity if
    /// one starts at `p`.
    static const char *getChar(const char *p, char *value);

    /// If tag is p's prefix, return true, else return false.
    static bool stringEqual(const char *p, const char *tag, bool ignoreCase);

    // Location information for this object.
    TiHtmCursor location;

    static int isAlpha(unsigned char c);
    static int isAlnum(unsigned char c);
    static int toLower(int c) { return tolower(c); }

private:
    TiHtmBase(const TiHtmBase &);          // copying is not allowed
    void operator=(const TiHtmBase &base); // assignment is not allowed

    /// One escape entity: its text, the text length, and the character it
    /// stands for.
    struct Entity
    {
        const char *str;
        unsigned int strLength;
        const char chr;
    };
    enum
    {
        NUM_ENTITY = 5,
        MAX_ENTITY_LENGTH = 6
    };

    static Entity entity[NUM_ENTITY]; // defined in file tinyhtmparser.cpp
};


/// A document identified by a file name, which is kept in `value`.
class TiHtmDocument
{
public:
    /// Creates a document with an empty `value`.
    TiHtmDocument() {}

    /// Creates a document whose `value` is `filename`.
    TiHtmDocument(std::string filename) : value(filename) {}

    /// Loads the file named by `value`.
    bool loadFile();
    /// Loads the file named by `filename`.
    bool loadFile(const char *filename);
    /// Loads the file named by `filename`.
    bool loadFile(const std::string filename);
    /// Loads from an already opened stream.
    bool loadFile(FILE *file);

    /// Copy of the stored value.
    const std::string getValue() const { return value; }

    /// The stored value as a C string.
    const char *getValueCstr() const { return value.c_str(); }

private:
    // This member should eventually move to TiHtmNode.
    std::string value;
};

#endif

177 changes: 177 additions & 0 deletions tinyhtmparser.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#include "tinyhtm.h"

// Table of escape entities searched by TiHtmBase::getEntity.
// Each row is {entity text, length of that text, decoded character}.
TiHtmBase::Entity TiHtmBase::entity[TiHtmBase::NUM_ENTITY] =
{
{"&amp;", 5, '&'},
{"&lt;", 4, '<'},
{"&gt;", 4, '>'},
{"&quot;", 6, '\"'},
{"&apos;", 6, '\''}
};


// the scope of class TiHtmBase

/// True when `c` is a line feed, a carriage return, or any character
/// for which isspace() is non-zero.
bool TiHtmBase::isWhiteSpace(char c)
{
    if (c == '\n' || c == '\r')
        return true;

    const unsigned char uc = (unsigned char)c;
    return isspace(uc) != 0;
}

/// Advances past leading white space.
/// Returns `p` unchanged when it is null; otherwise returns a pointer to the
/// first character that is not white space (possibly the terminating NUL).
const char *TiHtmBase::skipWhiteSpace(const char *p)
{
    if (!p)
        return p;

    for (; *p != '\0' && isWhiteSpace(*p); ++p)
    {
    }
    return p;
}

const char *TiHtmBase::readName(const char *p, std::string *name)
{
*name = "";
assert(p);

if (*p && (isAlpha(*p) || *p == '_'))
{
const char *pstart = p;
while (*p && (isAlpha(*p) || *p == '_' || *p == '-' || *p == '.' || *p == ':'))
{
p++;
}
if (p - pstart > 0)
{
name->assign(pstart, p - pstart);
}
return p;
}

return NULL;
}

const char *TiHtmBase::readText(const char *p, std::string *text, bool trimWhiteSpace, const char *endTag, bool ignoreCase)
{
*text = "";
assert(p);

if (!trimWhiteSpace)
{
while (*p && !stringEqual(p, endTag, ignoreCase))
{
char chr;
p = getChar(p, &chr);
text->append(1, chr);
}
}
else
{
bool whiteSpace = false;

p = skipWhiteSpace(p);
while (*p && !stringEqual(p, endTag, ignoreCase))
{
if (isWhiteSpace(*p))
{
whiteSpace = true;
p++;
}
else
{
if (whiteSpace)
{
text->append(1, ' ');
whiteSpace = false;
}

char chr;
p = getChar(p, &chr);
text->append(1, chr);
}
}
}

if (*p)
p += strlen(endTag);

return (*p) ? p : NULL;
}

const char* TiHtmBase::getEntity(const char *p, char *value)
{
for (int i = 0; i < NUM_ENTITY; i++)
{
if (strncmp(p, entity[i].str, entity[i].strLength) == 0)
{
*value = entity[i].chr;
return (p + entity[i].strLength);
}
}

*value = *p;
return (p + 1);
}

/// Reads one character from `p` into `*value`.
/// A leading '&' is handed to getEntity(); any other character is copied
/// as is. Returns the position after what was consumed.
const char *TiHtmBase::getChar(const char *p, char *value)
{
    if (*p != '&')
    {
        *value = *p;
        return p + 1;
    }

    return getEntity(p, value);
}

/// if tag is p's prefix, return true, else return false
bool TiHtmBase::stringEqual(const char *p, const char *tag, bool ignoreCase)
{
    // A null or empty `p` is treated as a caller error: assert, then
    // report "no match".
    if (!p || !*p)
    {
        assert(0);
        return false;
    }

    // Walk both strings while they agree. `tag` is a prefix of `p` exactly
    // when the walk reaches the end of `tag`.
    for (; *p && *tag; ++p, ++tag)
    {
        if (ignoreCase)
        {
            // tolower() requires a value representable as unsigned char;
            // a plain char may be negative, so convert first.
            if (toLower(static_cast<unsigned char>(*p)) != toLower(static_cast<unsigned char>(*tag)))
                break;
        }
        else if (*p != *tag)
        {
            break;
        }
    }

    return *tag == '\0';
}

/// Returns isalpha(c) for values below 127, and 0 for everything else.
int TiHtmBase::isAlpha(unsigned char c)
{
    return (c < 127) ? isalpha(c) : 0;
}

/// Returns isalnum(c) for values below 127, and 0 for everything else.
int TiHtmBase::isAlnum(unsigned char c)
{
    return (c < 127) ? isalnum(c) : 0;
}

0 comments on commit 33ce9a9

Please sign in to comment.