first commit, add class TiHtmBase

Eternity1987 · Sep 29, 2015 · 33ce9a9 · 33ce9a9
commit 33ce9a9
Show file tree

Hide file tree

Showing 6 changed files with 380 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+htmtest.cpp
+htmtest.html
+
diff --git a/Makefile b/Makefile
@@ -0,0 +1,16 @@
+# Object files linked into the htmtest binary.
+objects=tinyhtm.o tinyhtmparser.o htmtest.o
+
+# `clean` names an action, not a file: keep it runnable even if a file called "clean" exists.
+.PHONY: clean
+
+# Link step.
+htmtest: $(objects)
+	g++ -o htmtest $(objects)
+
+htmtest.o: htmtest.cpp
+	g++ -c htmtest.cpp
+
+tinyhtm.o: tinyhtm.h tinyhtm.cpp
+	g++ -c tinyhtm.cpp
+
+tinyhtmparser.o: tinyhtm.h tinyhtmparser.cpp
+	g++ -c tinyhtmparser.cpp
+
+# Remove every build product; -f keeps this from failing when some of them do not exist yet.
+clean:
+	rm -f $(objects) htmtest
diff --git a/readme b/readme
@@ -0,0 +1 @@
+	This is a simple html file parser.
diff --git a/tinyhtm.cpp b/tinyhtm.cpp
@@ -0,0 +1,72 @@
+#include "tinyhtm.h"
+
+/// Version string of the library, handed out by TiHtmVersion().
+const char *TIHTMVERSION = "tinyhtml-0.1";
+
+/// Return the library version string (the pointer stored in TIHTMVERSION).
+const char *TiHtmVersion()
+{
+	return TIHTMVERSION;
+}
+/// File-local wrapper around fopen(): forwards `filename` and `mode` unchanged
+/// and hands back whatever fopen() returned (NULL on failure).
+static FILE *TiHtmOpen(const char *filename, const char *mode)
+{
+	FILE *stream = fopen(filename, mode);
+	return stream;
+}
+
+// the scope of class TiHtmBase
+
+// the scope of class TiHtmDocument
+
+/// Load the file whose name is stored in `value` by forwarding to the
+/// `const char *` overload.
+bool TiHtmDocument::loadFile()
+{
+	const char *storedName = value.c_str();
+	return loadFile(storedName);
+}
+
+/// Load the file named by `filename` by forwarding its C string to the
+/// `const char *` overload.
+bool TiHtmDocument::loadFile(std::string filename)
+{
+	const char *path = filename.c_str();
+	return loadFile(path);
+}
+
+/// Open `filename` in binary read mode and load it through loadFile(FILE *).
+///
+/// @param filename  Path of the file to read. When it is NULL the path stored
+///                  in `value` is used instead.
+/// @return false when no file name is available at all or the file cannot be
+///         opened; otherwise the result of loadFile(FILE *).
+bool TiHtmDocument::loadFile(const char *filename)
+{
+	if (!filename)
+	{
+		if (value.empty())
+		{
+			// Nothing to open. Report it and fail like any other load error
+			// instead of terminating the whole process.
+			TiHtmCout << "filename is null && value is empty" << TiHtmEndl;
+			return false;
+		}
+
+		// NULL must never reach fopen(); fall back to the stored name.
+		filename = value.c_str();
+	}
+
+	FILE *file = TiHtmOpen(filename, "rb");
+	if (!file)
+		return false;
+
+	const bool result = loadFile(file);
+	fclose(file);
+	return result;
+}
+
+/// Read the whole content of an already opened stream and print it to
+/// TiHtmCout followed by TiHtmEndl.
+///
+/// The stream is not closed here; that stays the caller's job.
+///
+/// @param file  Stream to read; it must support fseek()/ftell().
+/// @return true when the content was read (an empty file counts as success),
+///         false when `file` is NULL, its size cannot be determined, or the
+///         read comes up short.
+bool TiHtmDocument::loadFile(FILE *file)
+{
+	if (!file)
+		return false;
+
+	// The size is found by seeking to the end; each step may fail
+	// (for example on a stream that is not seekable).
+	if (fseek(file, 0, SEEK_END) != 0)
+		return false;
+	const long length = ftell(file);
+	if (length < 0 || fseek(file, 0, SEEK_SET) != 0)
+		return false;
+
+	// std::string owns the buffer, so it is released on every exit path.
+	std::string buf(static_cast<std::string::size_type>(length), '\0');
+
+	// fread() reports 0 items for a zero-byte request, so only read when
+	// there is something to read.
+	if (length > 0 && fread(&buf[0], buf.size(), 1, file) != 1)
+		return false;
+
+	// Printed as a C string, i.e. up to the first NUL byte.
+	TiHtmCout << buf.c_str() << TiHtmEndl;
+
+	return true;
+}
+
diff --git a/tinyhtm.h b/tinyhtm.h
@@ -0,0 +1,111 @@
+#ifndef TINYHTM_H
+#define TINYHTM_H
+
+#include <iostream>
+#include <string>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+
+#define TiHtmCout std::cout
+#define TiHtmCin  std::cin
+#define TiHtmEndl std::endl
+
+
+/// Row/column position; both fields hold -1 after construction and after clear().
+struct TiHtmCursor
+{
+	TiHtmCursor() : row(-1), column(-1) {}
+
+	/// Put both coordinates back to -1.
+	void clear()
+	{
+		row = -1;
+		column = -1;
+	}
+
+	int row;
+	int column;
+};
+
+
+/// Abstract base class of the whole tinyhtml library: keeps a location
+/// (row/column) and provides the static text helpers shared by derived classes.
+class TiHtmBase
+{
+public:
+	TiHtmBase() {}
+	virtual ~TiHtmBase() {}
+
+	/// Print this object; implemented by derived classes.
+	virtual void print() const = 0;
+	/// Parse the text starting at `p`; implemented by derived classes, which
+	/// also define the meaning of the returned pointer.
+	virtual const char *parse(const char *p) = 0;
+
+	/// Row stored in `location`.
+	int row() const { return location.row; }
+	/// Column stored in `location`.
+	int column() const { return location.column; }
+
+	//static void encodeString(const char *str, char *out); // not implemented yet
+
+protected:
+	/// True when `c` is a whitespace character.
+	static bool isWhiteSpace(char c);
+	/// Overload for int values: anything >= 256 is never whitespace, smaller
+	/// values are narrowed to char and checked by the overload above.
+	static bool isWhiteSpace(int c)
+	{
+		if (c < 256)
+			return isWhiteSpace((char)c);
+		return false;
+	}
+	/// Advance `p` past leading whitespace and return the new position.
+	static const char *skipWhiteSpace(const char *p);
+
+	/// Read a name starting at `p` into `*name`.
+	static const char *readName(const char *p, std::string *name);
+	/// Read text starting at `p` into `*text` until `endTag` is found.
+	static const char *readText(const char *p, std::string *text, bool trimWhiteSpace, const char *endTag, bool ignoreCase);
+	/// Convert an escape sequence back into the character it stands for,
+	/// e.g. "&lt;" becomes '<'.
+	static const char *getEntity(const char *p, char *value);
+	/// Fetch one character from the input, converting an escape sequence if
+	/// there is one at `p`.
+	static const char *getChar(const char *p, char *value);
+
+	/// If `tag` is a prefix of `p`, return true, else return false.
+	static bool stringEqual(const char *p, const char *tag, bool ignoreCase);
+
+	// Location information of this object.
+	TiHtmCursor location;
+
+	static int isAlpha(unsigned char c);
+	static int isAlnum(unsigned char c);
+	static int toLower(int c) { return tolower(c); }
+
+private:
+	TiHtmBase(const TiHtmBase &);			// copying is not allowed
+	void operator=(const TiHtmBase &base); // assignment is not allowed
+
+	/// One row of the escape-sequence table.
+	struct Entity
+	{
+		const char *str;		// escaped text, e.g. "&amp;"
+		unsigned int strLength;	// number of characters in `str`
+		const char chr;			// character the escape stands for
+	};
+	enum
+	{
+		NUM_ENTITY = 5,			// number of rows in `entity`
+		MAX_ENTITY_LENGTH = 6
+	};
+
+	static Entity entity[NUM_ENTITY]; // defined in file tinyhtmparser.cpp
+};
+
+
+/// Document object: remembers a file name in `value` and offers several ways
+/// to load a file.
+class TiHtmDocument
+{
+public:
+	/// Create a document whose stored name is empty.
+	TiHtmDocument() : value() {}
+	/// Create a document whose stored name is `filename`.
+	TiHtmDocument(std::string filename) : value(filename) {}
+
+	/// Load the file named by the stored value.
+	bool loadFile();
+	/// Load the file named by a C string.
+	bool loadFile(const char *filename);
+	/// Load the file named by a std::string.
+	bool loadFile(const std::string filename);
+	/// Load from an already opened stream.
+	bool loadFile(FILE *file);
+
+	/// Copy of the stored value.
+	const std::string getValue() const { return value; }
+	/// Stored value as a C string.
+	const char *getValueCstr() const { return value.c_str(); }
+
+private:
+	// This member does not belong here; it is meant to move into TiHtmNode.
+	std::string value;
+};
+
+#endif
+
diff --git a/tinyhtmparser.cpp b/tinyhtmparser.cpp
@@ -0,0 +1,177 @@
+#include "tinyhtm.h"
+
+// Escape-sequence table; each row is { escaped text, length of that text, decoded character }.
+TiHtmBase::Entity TiHtmBase::entity[TiHtmBase::NUM_ENTITY] =
+{
+	{ "&amp;",  5, '&'  },
+	{ "&lt;",   4, '<'  },
+	{ "&gt;",   4, '>'  },
+	{ "&quot;", 6, '"'  },
+	{ "&apos;", 6, '\'' }
+};
+
+
+// the scope of class TiHtmBase
+
+/// True when `c` is '\n', '\r' or any other character accepted by isspace().
+bool TiHtmBase::isWhiteSpace(char c)
+{
+	if (c == '\n' || c == '\r')
+		return true;
+
+	// isspace() needs a value in the unsigned char range.
+	return isspace(static_cast<unsigned char>(c)) != 0;
+}
+
+/// Return the first position at or after `p` that is not whitespace (this may
+/// be the terminating NUL). A NULL `p` is handed back unchanged.
+const char *TiHtmBase::skipWhiteSpace(const char *p)
+{
+	if (!p)
+		return p;
+
+	for (; *p && isWhiteSpace(*p); ++p)
+	{
+		// nothing to do, the loop header does the stepping
+	}
+	return p;
+}
+
+/// Read a name starting at `p` into `*name`.
+///
+/// A name starts with a letter or '_' and continues with letters, digits,
+/// '_', '-', '.' or ':' (so "h1" is read as "h1", not "h").
+///
+/// @param p     Start of the candidate name; must not be NULL.
+/// @param name  Receives the name; it is emptied first, so it is "" on failure.
+/// @return pointer to the first character after the name, or NULL when the
+///         text at `p` does not start a name.
+const char *TiHtmBase::readName(const char *p, std::string *name)
+{
+	*name = "";
+	assert(p);
+
+	const unsigned char first = static_cast<unsigned char>(*p);
+	if (!first || !(isAlpha(first) || first == '_'))
+		return NULL;
+
+	const char *pstart = p;
+	// Digits are allowed after the first character, hence isAlnum() here.
+	while (*p && (isAlnum(static_cast<unsigned char>(*p)) || *p == '_' || *p == '-' || *p == '.' || *p == ':'))
+	{
+		p++;
+	}
+
+	name->assign(pstart, p - pstart);
+	return p;
+}
+
+/// Collect text from `p` into `*text` until `endTag` is reached or the input ends.
+///
+/// Every character goes through getChar(), so escape sequences are decoded.
+/// With `trimWhiteSpace` set, leading whitespace is skipped, each whitespace
+/// run between other characters becomes a single ' ', and trailing whitespace
+/// is dropped.
+///
+/// @return pointer just past `endTag`; NULL when the input ended before
+///         `endTag` was found or when nothing follows `endTag`.
+const char *TiHtmBase::readText(const char *p, std::string *text, bool trimWhiteSpace, const char *endTag, bool ignoreCase)
+{
+	text->clear();
+	assert(p);
+
+	if (trimWhiteSpace)
+	{
+		// Set while a whitespace run has been seen but not yet written out.
+		bool pendingSpace = false;
+
+		p = skipWhiteSpace(p);
+		while (*p && !stringEqual(p, endTag, ignoreCase))
+		{
+			if (isWhiteSpace(*p))
+			{
+				pendingSpace = true;
+				++p;
+				continue;
+			}
+
+			if (pendingSpace)
+			{
+				text->push_back(' ');
+				pendingSpace = false;
+			}
+
+			char decoded;
+			p = getChar(p, &decoded);
+			text->push_back(decoded);
+		}
+	}
+	else
+	{
+		// Keep every character exactly as it comes.
+		while (*p && !stringEqual(p, endTag, ignoreCase))
+		{
+			char decoded;
+			p = getChar(p, &decoded);
+			text->push_back(decoded);
+		}
+	}
+
+	// Input ended without reaching endTag.
+	if (!*p)
+		return NULL;
+
+	p += strlen(endTag);
+	return *p ? p : NULL;
+}
+
+/// Decode the escape sequence at `p` using the `entity` table.
+///
+/// When a table row matches, `*value` gets its character and the returned
+/// pointer is just past the escape text. Otherwise `*value` gets `*p` itself
+/// and the returned pointer is `p + 1`.
+const char* TiHtmBase::getEntity(const char *p, char *value)
+{
+	const Entity *const tableEnd = entity + NUM_ENTITY;
+
+	for (const Entity *candidate = entity; candidate != tableEnd; ++candidate)
+	{
+		if (strncmp(p, candidate->str, candidate->strLength) != 0)
+			continue;
+
+		*value = candidate->chr;
+		return p + candidate->strLength;
+	}
+
+	// No known escape here: pass the character through untouched.
+	*value = *p;
+	return p + 1;
+}
+
+/// Store the next character of the input in `*value` and return the position
+/// after it. A leading '&' is handed to getEntity(); any other character is
+/// copied as is.
+const char *TiHtmBase::getChar(const char *p, char *value)
+{
+	if (*p != '&')
+	{
+		*value = *p;
+		return p + 1;
+	}
+
+	return getEntity(p, value);
+}
+
+/// If `tag` is a prefix of `p`, return true, else return false.
+///
+/// @param p           Text to inspect; must be non-NULL and non-empty
+///                    (asserted, and false is returned otherwise).
+/// @param tag         Prefix to look for; must be non-NULL (asserted as well).
+/// @param ignoreCase  Compare through toLower() when true.
+bool TiHtmBase::stringEqual(const char *p, const char *tag, bool ignoreCase)
+{
+	if (!p || !*p || !tag)
+	{
+		assert(0);
+		return false;
+	}
+
+	for (; *p && *tag; ++p, ++tag)
+	{
+		// tolower() is only defined for unsigned char values (and EOF), so
+		// bytes >= 0x80 must not reach it as negative chars.
+		const unsigned char lhs = static_cast<unsigned char>(*p);
+		const unsigned char rhs = static_cast<unsigned char>(*tag);
+
+		const bool same = ignoreCase ? (toLower(lhs) == toLower(rhs)) : (lhs == rhs);
+		if (!same)
+			break;
+	}
+
+	// The whole tag was consumed exactly when it is a prefix of p.
+	return *tag == '\0';
+}
+
+/// isalpha() for values below 127; every other value yields 0.
+int TiHtmBase::isAlpha(unsigned char c)
+{
+	return (c < 127) ? isalpha(c) : 0;
+}
+
+/// isalnum() for values below 127; every other value yields 0.
+int TiHtmBase::isAlnum(unsigned char c)
+{
+	return (c < 127) ? isalnum(c) : 0;
+}