forked from luoxn28/tinyhtml
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 33ce9a9
Showing
6 changed files
with
380 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
htmtest.cpp | ||
htmtest.html | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Builds the htmtest demo program from the tinyhtm sources.
objects=tinyhtm.o tinyhtmparser.o htmtest.o

htmtest: $(objects)
	g++ -o htmtest $(objects)

# NOTE(review): htmtest.cpp is presumably a client of tinyhtm.h, so the header
# is listed as a prerequisite - confirm against htmtest.cpp.
htmtest.o: htmtest.cpp tinyhtm.h
	g++ -c htmtest.cpp

tinyhtm.o: tinyhtm.h tinyhtm.cpp
	g++ -c tinyhtm.cpp

tinyhtmparser.o: tinyhtm.h tinyhtmparser.cpp
	g++ -c tinyhtmparser.cpp

# clean is a command, not a file. -f keeps it from failing when some outputs
# were never built; the linked binary is removed together with the objects.
.PHONY: clean
clean:
	rm -f htmtest $(objects)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
This is a simple HTML file parser. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
#include "tinyhtm.h" | ||
|
||
/// Library version string ("tinyhtml-<major>.<minor>").
const char *TIHTMVERSION = "tinyhtml-0.1";

/// Return the library version string held in TIHTMVERSION.
const char *TiHtmVersion()
{
    return TIHTMVERSION;
}
/// File-local wrapper around fopen().
/// @param filename path handed to fopen()
/// @param mode     fopen() mode string
/// @return whatever fopen() returns (NULL when the file cannot be opened)
static FILE *TiHtmOpen(const char *filename, const char *mode)
{
    FILE *stream = fopen(filename, mode);
    return stream;
}
|
||
// the scope of class TiHtmBase | ||
|
||
// the scope of class TiHtmDocument | ||
|
||
bool TiHtmDocument::loadFile() | ||
{ | ||
return loadFile(value.c_str()); | ||
} | ||
|
||
/// Load the file named by `filename`.
/// @return the result of loadFile(const char *) for that name
bool TiHtmDocument::loadFile(std::string filename)
{
    const char *path = filename.c_str();
    return loadFile(path);
}
|
||
bool TiHtmDocument::loadFile(const char *filename) | ||
{ | ||
if (!filename && value.empty()) | ||
{ | ||
TiHtmCout << "filename is null && value is empty" << TiHtmEndl; | ||
exit(-1); | ||
} | ||
|
||
FILE *file = TiHtmOpen(filename, "rb"); | ||
|
||
if (file) | ||
{ | ||
bool result = loadFile(file); | ||
fclose(file); | ||
return result; | ||
} | ||
else | ||
{ | ||
return false; | ||
} | ||
} | ||
|
||
bool TiHtmDocument::loadFile(FILE *file) | ||
{ | ||
if (!file) | ||
exit(-1); | ||
|
||
long length = 0; | ||
fseek(file, 0, SEEK_END); | ||
length = ftell(file); | ||
fseek(file, 0, SEEK_SET); | ||
|
||
char *buf = new char[length + 1]; | ||
|
||
if (fread(buf, length, 1, file) != 1) | ||
{ | ||
exit (-1); | ||
} | ||
buf[length] = '\0'; | ||
|
||
TiHtmCout << buf << TiHtmEndl; | ||
|
||
delete []buf; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
#ifndef TINYHTM_H | ||
#define TINYHTM_H | ||
|
||
#include <iostream> | ||
#include <string> | ||
|
||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <string.h> | ||
#include <ctype.h> | ||
#include <assert.h> | ||
|
||
#define TiHtmCout std::cout | ||
#define TiHtmCin std::cin | ||
#define TiHtmEndl std::endl | ||
|
||
|
||
/// Row/column position record. Both fields are -1 after construction
/// and after clear().
struct TiHtmCursor
{
    TiHtmCursor() : row(-1), column(-1) {}

    /// Reset both coordinates to -1.
    void clear()
    {
        row = -1;
        column = -1;
    }

    int row;
    int column;
};
|
||
|
||
/// the base class of whole tinyhtml | ||
class TiHtmBase | ||
{ | ||
public: | ||
TiHtmBase() {} | ||
virtual ~TiHtmBase() {} | ||
|
||
virtual void print() const = 0; | ||
virtual const char *parse(const char *p) = 0; | ||
|
||
int row() const { return location.row; } | ||
int column() const { return location.row; } | ||
|
||
//static void encodeString(const char *str, char *out); // not write code | ||
|
||
protected: | ||
static bool isWhiteSpace(char c); | ||
static bool isWhiteSpace(int c) | ||
{ | ||
if (c < 256) | ||
return isWhiteSpace((char)c); | ||
return false; | ||
} | ||
static const char *skipWhiteSpace(const char *p); | ||
|
||
static const char *readName(const char *p, std::string *name); | ||
static const char *readText(const char *p, std::string *text, bool trimWhiteSpace, const char *endTag, bool ignoreCase); | ||
/// 把转义字符转化为原来的字符,比如"<"转化为 '<' | ||
static const char *getEntity(const char *p, char *value); | ||
/// 从输入流中获取一个字符,有可能转换转义字符 | ||
static const char *getChar(const char *p, char *value); | ||
|
||
/// if tge is p's prefix, return true, else return false | ||
static bool stringEqual(const char *p, const char *tag, bool ignoreCase); | ||
|
||
// record the location message | ||
TiHtmCursor location; | ||
|
||
static int isAlpha(unsigned char c); | ||
static int isAlnum(unsigned char c); | ||
static int toLower(int c) { return tolower(c); } | ||
|
||
private: | ||
TiHtmBase(const TiHtmBase &); // not allowd | ||
void operator=(const TiHtmBase &base); // not allowd | ||
|
||
struct Entity | ||
{ | ||
const char *str; | ||
unsigned int strLength; | ||
const char chr; | ||
}; | ||
enum | ||
{ | ||
NUM_ENTITY = 5, | ||
MAX_ENTITY_LENGTH = 6 | ||
}; | ||
|
||
static Entity entity[NUM_ENTITY]; // defined in file tinyhtmparser.cpp | ||
}; | ||
|
||
|
||
/// Document object. `value` holds the file name that the argument-less
/// loadFile() opens.
class TiHtmDocument
{
public:
    /// Start with an empty file name.
    TiHtmDocument() : value() {}
    /// Remember `filename` for a later loadFile() call.
    TiHtmDocument(std::string filename) : value(filename) {}

    /// Load the file named by `value`.
    bool loadFile();
    /// Load the file at the given path.
    bool loadFile(const char *filename);
    /// Load the file at the given path.
    bool loadFile(const std::string filename);
    /// Load from an already opened stream.
    bool loadFile(FILE *file);

    /// Copy of the stored value.
    const std::string getValue() const { return value; }
    /// C-string view of the stored value.
    const char *getValueCstr()const { return value.c_str(); }

private:
    // Temporary home for this member; it is meant to move to TiHtmNode.
    std::string value;
};
|
||
#endif | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,177 @@ | ||
#include "tinyhtm.h" | ||
|
||
// Table of the recognised entity references: entity text, its length in
// characters, and the character it decodes to. Each strLength must equal
// strlen(str), because getEntity() compares exactly that many characters.
TiHtmBase::Entity TiHtmBase::entity[TiHtmBase::NUM_ENTITY] =
{
    {"&amp;",  5, '&'},
    {"&lt;",   4, '<'},
    {"&gt;",   4, '>'},
    {"&quot;", 6, '\"'},
    {"&apos;", 6, '\''}
};
|
||
|
||
// the scope of class TiHtmBase | ||
|
||
/// True when c is '\n', '\r', or anything isspace() accepts.
bool TiHtmBase::isWhiteSpace(char c)
{
    if (c == '\n' || c == '\r')
    {
        return true;
    }
    // Cast first: isspace() must not be given a negative char value.
    return isspace((unsigned char)c) != 0;
}
|
||
const char *TiHtmBase::skipWhiteSpace(const char *p) | ||
{ | ||
while (p && *p && isWhiteSpace(*p)) | ||
{ | ||
p++; | ||
} | ||
return p; | ||
} | ||
|
||
const char *TiHtmBase::readName(const char *p, std::string *name) | ||
{ | ||
*name = ""; | ||
assert(p); | ||
|
||
if (*p && (isAlpha(*p) || *p == '_')) | ||
{ | ||
const char *pstart = p; | ||
while (*p && (isAlpha(*p) || *p == '_' || *p == '-' || *p == '.' || *p == ':')) | ||
{ | ||
p++; | ||
} | ||
if (p - pstart > 0) | ||
{ | ||
name->assign(pstart, p - pstart); | ||
} | ||
return p; | ||
} | ||
|
||
return NULL; | ||
} | ||
|
||
const char *TiHtmBase::readText(const char *p, std::string *text, bool trimWhiteSpace, const char *endTag, bool ignoreCase) | ||
{ | ||
*text = ""; | ||
assert(p); | ||
|
||
if (!trimWhiteSpace) | ||
{ | ||
while (*p && !stringEqual(p, endTag, ignoreCase)) | ||
{ | ||
char chr; | ||
p = getChar(p, &chr); | ||
text->append(1, chr); | ||
} | ||
} | ||
else | ||
{ | ||
bool whiteSpace = false; | ||
|
||
p = skipWhiteSpace(p); | ||
while (*p && !stringEqual(p, endTag, ignoreCase)) | ||
{ | ||
if (isWhiteSpace(*p)) | ||
{ | ||
whiteSpace = true; | ||
p++; | ||
} | ||
else | ||
{ | ||
if (whiteSpace) | ||
{ | ||
text->append(1, ' '); | ||
whiteSpace = false; | ||
} | ||
|
||
char chr; | ||
p = getChar(p, &chr); | ||
text->append(1, chr); | ||
} | ||
} | ||
} | ||
|
||
if (*p) | ||
p += strlen(endTag); | ||
|
||
return (*p) ? p : NULL; | ||
} | ||
|
||
const char* TiHtmBase::getEntity(const char *p, char *value) | ||
{ | ||
for (int i = 0; i < NUM_ENTITY; i++) | ||
{ | ||
if (strncmp(p, entity[i].str, entity[i].strLength) == 0) | ||
{ | ||
*value = entity[i].chr; | ||
return (p + entity[i].strLength); | ||
} | ||
} | ||
|
||
*value = *p; | ||
return (p + 1); | ||
} | ||
|
||
const char *TiHtmBase::getChar(const char *p, char *value) | ||
{ | ||
if (*p == '&') | ||
{ | ||
return getEntity(p, value); | ||
} | ||
else | ||
{ | ||
*value = *p; | ||
return (p + 1); | ||
} | ||
} | ||
|
||
/// if tag is p's prefix, return true, else return false | ||
bool TiHtmBase::stringEqual(const char *p, const char *tag, bool ignoreCase) | ||
{ | ||
if (!p || !*p) | ||
{ | ||
assert(0); | ||
return false; | ||
} | ||
|
||
const char *pstart = p; | ||
|
||
if (ignoreCase) | ||
{ | ||
while (*pstart && *tag && toLower(*pstart) == toLower(*tag)) | ||
{ | ||
pstart++; | ||
tag++; | ||
} | ||
|
||
if (!*tag) | ||
return true; | ||
} | ||
else | ||
{ | ||
while (*pstart && *tag && *pstart == *tag) | ||
{ | ||
pstart++; | ||
tag++; | ||
} | ||
|
||
if (!*tag) | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
/// isalpha() restricted to values below 127; everything else yields 0.
int TiHtmBase::isAlpha(unsigned char c)
{
    return (c < 127) ? isalpha(c) : 0;
}
|
||
/// isalnum() restricted to values below 127; everything else yields 0.
int TiHtmBase::isAlnum(unsigned char c)
{
    return (c < 127) ? isalnum(c) : 0;
}