diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b4a7dbd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+htmtest.cpp
+htmtest.html
+
diff --git a/Makefile b/Makefile
new file mode 100644
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,18 @@
+objects=tinyhtm.o tinyhtmparser.o htmtest.o
+
+htmtest: $(objects)
+	g++ -o htmtest $(objects)
+
+htmtest.o: htmtest.cpp
+	g++ -c htmtest.cpp
+
+tinyhtm.o: tinyhtm.h tinyhtm.cpp
+	g++ -c tinyhtm.cpp
+
+tinyhtmparser.o: tinyhtm.h tinyhtmparser.cpp
+	g++ -c tinyhtmparser.cpp
+
+# -f keeps clean from failing when nothing has been built yet.
+.PHONY: clean
+clean:
+	rm -f $(objects) htmtest
diff --git a/readme b/readme
new file mode 100644
index 0000000..b57e0e4
--- /dev/null
+++ b/readme
@@ -0,0 +1 @@
+ This is a simple html file parser.
diff --git a/tinyhtm.cpp b/tinyhtm.cpp
new file mode 100644
--- /dev/null
+++ b/tinyhtm.cpp
@@ -0,0 +1,85 @@
+#include "tinyhtm.h"
+
+// NOTE(review): "intyhtml" looks like a transposition of "tinyhtml". The
+// string is left untouched because callers can observe it at runtime.
+const char *TIHTMVERSION = "intyhtml-0.1";
+
+/// Return the library version string.
+const char *TiHtmVersion()
+{
+    return TIHTMVERSION;
+}
+
+/// Open `filename` with the given fopen() mode; NULL on failure.
+static FILE *TiHtmOpen(const char *filename, const char *mode)
+{
+    return fopen(filename, mode);
+}
+
+// the scope of class TiHtmBase
+
+// the scope of class TiHtmDocument
+
+/// Load the file whose name is stored in the document.
+bool TiHtmDocument::loadFile()
+{
+    return loadFile(value.c_str());
+}
+
+/// Load the file named by `filename`.
+bool TiHtmDocument::loadFile(std::string filename)
+{
+    return loadFile(filename.c_str());
+}
+
+/// Load the file named by `filename`. A null `filename` falls back to the
+/// name stored in the document.
+/// Returns false when no name is available, the file cannot be opened, or
+/// its contents cannot be read.
+bool TiHtmDocument::loadFile(const char *filename)
+{
+    if (!filename)
+    {
+        if (value.empty())
+        {
+            TiHtmCout << "filename is null && value is empty" << TiHtmEndl;
+            return false;
+        }
+        // Never pass a null path to fopen(); use the stored name instead.
+        filename = value.c_str();
+    }
+
+    FILE *file = TiHtmOpen(filename, "rb");
+    if (!file)
+        return false;
+
+    const bool result = loadFile(file);
+    fclose(file);
+    return result;
+}
+
+/// Read all of `file` into memory and echo it to standard output.
+/// Returns true on success and false if `file` is null or cannot be sized or
+/// read; failures never terminate the process. Does not close `file`.
+bool TiHtmDocument::loadFile(FILE *file)
+{
+    if (!file)
+        return false;
+
+    // Measure the file by seeking to its end, then rewind for reading.
+    if (fseek(file, 0, SEEK_END) != 0)
+        return false;
+    const long length = ftell(file);
+    if (length < 0 || fseek(file, 0, SEEK_SET) != 0)
+        return false;
+
+    // std::string owns the buffer, so every return path releases it.
+    std::string buf(static_cast<std::string::size_type>(length), '\0');
+    // An empty file is valid: skip fread() instead of treating it as an error.
+    if (length > 0 && fread(&buf[0], 1, buf.size(), file) != buf.size())
+        return false;
+
+    TiHtmCout << buf.c_str() << TiHtmEndl;
+    return true;
+}
+
diff --git a/tinyhtm.h b/tinyhtm.h
new file mode 100644
--- /dev/null
+++ b/tinyhtm.h
@@ -0,0 +1,117 @@
+#ifndef TINYHTM_H
+#define TINYHTM_H
+
+// NOTE(review): the header names were missing from this patch; the seven below
+// are inferred from the library calls made in tinyhtm.cpp/tinyhtmparser.cpp.
+// The C spellings are used because that code calls fopen(), exit(), strlen()
+// etc. without a std:: qualifier.
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+#include <iostream>
+#include <string>
+
+#define TiHtmCout std::cout
+#define TiHtmCin std::cin
+#define TiHtmEndl std::endl
+
+
+/// Records a row/column position; clear() resets both to -1.
+struct TiHtmCursor
+{
+    TiHtmCursor() { clear(); }
+    void clear() { row = column = -1; }
+
+    int row;
+    int column;
+};
+
+
+/// Base class of the whole tinyhtm library. It declares the print/parse
+/// interface and the static text helpers used while parsing.
+class TiHtmBase
+{
+public:
+    TiHtmBase() {}
+    virtual ~TiHtmBase() {}
+
+    virtual void print() const = 0;
+    virtual const char *parse(const char *p) = 0;
+
+    int row() const { return location.row; }
+    int column() const { return location.column; }
+
+    //static void encodeString(const char *str, char *out); // not implemented yet
+
+protected:
+    static bool isWhiteSpace(char c);
+    static bool isWhiteSpace(int c)
+    {
+        if (c < 256)
+            return isWhiteSpace(static_cast<char>(c));
+        return false;
+    }
+    static const char *skipWhiteSpace(const char *p);
+
+    static const char *readName(const char *p, std::string *name);
+    static const char *readText(const char *p, std::string *text, bool trimWhiteSpace, const char *endTag, bool ignoreCase);
+    /// Turn an entity back into the character it stands for, e.g. "&lt;" into '<'.
+    static const char *getEntity(const char *p, char *value);
+    /// Read one character from the input, decoding an entity if one starts there.
+    static const char *getChar(const char *p, char *value);
+
+    /// If tag is a prefix of p, return true, else return false.
+    static bool stringEqual(const char *p, const char *tag, bool ignoreCase);
+
+    // records the location information
+    TiHtmCursor location;
+
+    static int isAlpha(unsigned char c);
+    static int isAlnum(unsigned char c);
+    static int toLower(int c) { return tolower(c); }
+
+private:
+    TiHtmBase(const TiHtmBase &);           // copying is not allowed
+    void operator=(const TiHtmBase &base);  // assignment is not allowed
+
+    struct Entity
+    {
+        const char *str;
+        unsigned int strLength;
+        const char chr;
+    };
+    enum
+    {
+        NUM_ENTITY = 5,
+        MAX_ENTITY_LENGTH = 6
+    };
+
+    static Entity entity[NUM_ENTITY]; // defined in file tinyhtmparser.cpp
+};
+
+
+/// Document object: holds a file name and loads that file.
+class TiHtmDocument
+{
+public:
+    TiHtmDocument() : value("") {}
+    TiHtmDocument(std::string filename) : value(filename) {}
+
+    bool loadFile();
+    bool loadFile(const char *filename);
+    bool loadFile(const std::string filename);
+    bool loadFile(FILE *file);
+
+    const std::string getValue() const { return value; }
+    const char *getValueCstr()const { return value.c_str(); }
+
+private:
+    // This does not belong here; it is meant to move into TiHtmNode.
+    std::string value;
+};
+
+#endif
+
diff --git a/tinyhtmparser.cpp b/tinyhtmparser.cpp
new file mode 100644
--- /dev/null
+++ b/tinyhtmparser.cpp
@@ -0,0 +1,182 @@
+#include "tinyhtm.h"
+
+// Entity table: entity text, length of that text, decoded character.
+TiHtmBase::Entity TiHtmBase::entity[TiHtmBase::NUM_ENTITY] =
+    {
+        {"&amp;", 5, '&'},
+        {"&lt;", 4, '<'},
+        {"&gt;", 4, '>'},
+        {"&quot;", 6, '\"'},
+        {"&apos;", 6, '\''}
+    };
+
+
+// the scope of class TiHtmBase
+
+/// True for any character isspace() accepts, plus '\n' and '\r'.
+bool TiHtmBase::isWhiteSpace(char c)
+{
+    return (isspace(static_cast<unsigned char>(c)) || c == '\n' || c == '\r');
+}
+
+/// Advance past leading white space; a null `p` is returned unchanged.
+const char *TiHtmBase::skipWhiteSpace(const char *p)
+{
+    while (p && *p && isWhiteSpace(*p))
+    {
+        p++;
+    }
+    return p;
+}
+
+/// Read a name starting at `p` into `*name`. A name begins with a letter or
+/// '_' and continues with letters, digits, '_', '-', '.' or ':'.
+/// Returns the position just past the name, or NULL when no name starts at
+/// `p` (in which case `*name` is left empty).
+const char *TiHtmBase::readName(const char *p, std::string *name)
+{
+    *name = "";
+    assert(p);
+
+    if (*p && (isAlpha(*p) || *p == '_'))
+    {
+        const char *pstart = p;
+        // isAlnum, not isAlpha: digits are legal after the first character,
+        // otherwise a tag such as "h1" would be cut short to "h".
+        while (*p && (isAlnum(*p) || *p == '_' || *p == '-' || *p == '.' || *p == ':'))
+        {
+            p++;
+        }
+        if (p - pstart > 0)
+        {
+            name->assign(pstart, p - pstart);
+        }
+        return p;
+    }
+
+    return NULL;
+}
+
+/// Copy text from `p` into `*text` until `endTag` or the end of the input,
+/// decoding entities on the way.
+/// With `trimWhiteSpace`, leading white space is skipped, every inner run of
+/// white space becomes a single space, and a trailing run is dropped.
+/// Returns the position just past `endTag`; NULL when the input ends before
+/// `endTag` is found or nothing follows `endTag`.
+const char *TiHtmBase::readText(const char *p, std::string *text, bool trimWhiteSpace, const char *endTag, bool ignoreCase)
+{
+    *text = "";
+    assert(p);
+
+    if (!trimWhiteSpace)
+    {
+        while (*p && !stringEqual(p, endTag, ignoreCase))
+        {
+            char chr;
+            p = getChar(p, &chr);
+            text->append(1, chr);
+        }
+    }
+    else
+    {
+        // Set while a run of white space is pending; it is emitted as one
+        // space only when more text follows.
+        bool whiteSpace = false;
+
+        p = skipWhiteSpace(p);
+        while (*p && !stringEqual(p, endTag, ignoreCase))
+        {
+            if (isWhiteSpace(*p))
+            {
+                whiteSpace = true;
+                p++;
+            }
+            else
+            {
+                if (whiteSpace)
+                {
+                    text->append(1, ' ');
+                    whiteSpace = false;
+                }
+
+                char chr;
+                p = getChar(p, &chr);
+                text->append(1, chr);
+            }
+        }
+    }
+
+    if (*p)
+        p += strlen(endTag);
+
+    return (*p) ? p : NULL;
+}
+
+/// Decode the entity at `p` into `*value` and return the position after it.
+/// Text that matches no table entry is passed through one character at a time.
+const char* TiHtmBase::getEntity(const char *p, char *value)
+{
+    for (int i = 0; i < NUM_ENTITY; i++)
+    {
+        if (strncmp(p, entity[i].str, entity[i].strLength) == 0)
+        {
+            *value = entity[i].chr;
+            return (p + entity[i].strLength);
+        }
+    }
+
+    *value = *p;
+    return (p + 1);
+}
+
+/// Read one character at `p` into `*value`, decoding an entity when `p`
+/// starts with '&'. Returns the position after what was consumed.
+const char *TiHtmBase::getChar(const char *p, char *value)
+{
+    if (*p == '&')
+    {
+        return getEntity(p, value);
+    }
+
+    *value = *p;
+    return (p + 1);
+}
+
+/// If tag is a prefix of p, return true, else return false.
+/// An empty or null `p` trips the assert and (with asserts off) yields false.
+bool TiHtmBase::stringEqual(const char *p, const char *tag, bool ignoreCase)
+{
+    if (!p || !*p)
+    {
+        assert(0);
+        return false;
+    }
+
+    while (*p && *tag)
+    {
+        // Compare as unsigned char: tolower() is undefined for the negative
+        // values a signed char produces for bytes above 127.
+        const unsigned char a = static_cast<unsigned char>(*p);
+        const unsigned char b = static_cast<unsigned char>(*tag);
+        const bool same = ignoreCase ? (toLower(a) == toLower(b)) : (a == b);
+        if (!same)
+            break;
+        p++;
+        tag++;
+    }
+
+    // The tag is a prefix of p exactly when all of it was consumed.
+    return *tag == '\0';
+}
+
+/// isalpha() for codes below 127; every other value is reported as 0.
+int TiHtmBase::isAlpha(unsigned char c)
+{
+    return (c < 127) ? isalpha(c) : 0;
+}
+
+/// isalnum() for codes below 127; every other value is reported as 0.
+int TiHtmBase::isAlnum(unsigned char c)
+{
+    return (c < 127) ? isalnum(c) : 0;
+}