Support novel search on sites
ma6254 committed Apr 12, 2019
1 parent 27016fa commit 70e4aa8
Showing 17 changed files with 904 additions and 497 deletions.
32 changes: 27 additions & 5 deletions README.md
@@ -46,13 +46,37 @@

# Sometimes a `not match volumes` error occurs; enable Chromedp or PhantomJS
# Use Chromedp
> ./FictionDown --url https://book.qidian.com/info/3249362 d --driver chromedp
> ./FictionDown --url https://book.qidian.com/info/3249362 -d chromedp d
# Use PhantomJS
> ./FictionDown --url https://book.qidian.com/info/3249362 d --driver phantomjs
> ./FictionDown --url https://book.qidian.com/info/3249362 -d phantomjs d

> vim 一世之尊.FictionDown # add pirate novel source links by hand
> ./FictionDown -i 一世之尊.FictionDown d # fetch the pirated content
# once crawling finishes you can export a readable document
> ./FictionDown -i 一世之尊.FictionDown conv -f txt
# there are two ways to convert to epub
# 1. output markdown, then convert it to epub with pandoc
> ./FictionDown -i 一世之尊.FictionDown conv -f md
> pandoc -o 一世之尊.epub 一世之尊.md
# 2. output epub directly (some readers may report errors)
> ./FictionDown -i 一世之尊.FictionDown conv -f epub
```

#### On-site novel search is now supported, so pirate links no longer need to be filled in by hand

```bash
> ./FictionDown --url https://book.qidian.com/info/3249362 d # fetch the licensed book info

# Sometimes a `not match volumes` error occurs; enable Chromedp or PhantomJS
# Use Chromedp
> ./FictionDown --url https://book.qidian.com/info/3249362 --driver chromedp d
# Use PhantomJS
> ./FictionDown --url https://book.qidian.com/info/3249362 --driver phantomjs d

> ./FictionDown -i 一世之尊.FictionDown s -k 一世之尊 -p # search, then merge the matches into the cache file
> ./FictionDown -i 一世之尊.FictionDown d # fetch the pirated content
# once crawling finishes you can export a readable document
> ./FictionDown -i 一世之尊.FictionDown conv -f txt
# there are two ways to convert to epub
# 1. output markdown, then convert it to epub with pandoc
> ./FictionDown -i 一世之尊.FictionDown conv -f md
@@ -67,11 +91,9 @@
- Support 晋江文学城
- Support 纵横中文网
- Support 刺猬猫 (formerly 欢乐书客)
- Support direct epub output without pandoc
- Support on-site novel search
- ~~Support on-site novel search~~
- Clean up the spaghetti logic in the main package
- Tidy up the command-line flag style
- Fix path problems when converting md to epub on Windows
- Improve ad filtering
- Simplify the usage steps
- Improve log output
4 changes: 0 additions & 4 deletions cmd/FictionDown/download.go
@@ -34,10 +34,6 @@ var download = cli.Command{
Usage: "线程数",
Value: 10,
},
cli.StringFlag{
Name: "driver",
Usage: "请求方式,support: none,phantomjs,chromedp",
},
cli.StringFlag{
Name: "f",
Usage: "输出格式",
4 changes: 3 additions & 1 deletion cmd/FictionDown/main.go
@@ -57,7 +57,7 @@ var app = &cli.App{
Usage: "log file path",
},
cli.StringFlag{
Name: "driver",
Name: "driver,d",
Usage: "请求方式,support: none,phantomjs,chromedp",
Destination: &driver,
},
@@ -67,6 +67,8 @@
check,
edit,
convert,
pirate,
search,
},
}

25 changes: 25 additions & 0 deletions cmd/FictionDown/pirate.go
@@ -0,0 +1,25 @@
package main

import (
"log"

"github.com/ma6254/FictionDown/site"
"github.com/urfave/cli"
)

var pirate = cli.Command{
Name: "pirate",
Aliases: []string{"p"},
Usage: "检索盗版站点",
Flags: []cli.Flag{},
Action: func(c *cli.Context) error {
a := "https://www.biqiuge.com/book/4772/480965712.html/"

s, err := site.MatchOne(site.Sitepool, a)
if err != nil {
return err
}
log.Printf("匹配站点: %s %#v", s.Name, s.HomePage)
return nil
},
}
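The `pirate` command above exercises `site.MatchOne` against a single hard-coded biqiuge URL. Below is a minimal sketch of driving that helper more generally; only `site.MatchOne`, `site.Sitepool`, and the `Name`/`HomePage` fields are taken from this diff, while the loop, the second URL, and the error handling are illustrative assumptions.

```go
package main

import (
	"log"

	"github.com/ma6254/FictionDown/site"
)

func main() {
	// candidate URLs; the second one is a made-up placeholder
	urls := []string{
		"https://www.biqiuge.com/book/4772/480965712.html",
		"https://www.81new.com/12/34/",
	}
	for _, u := range urls {
		s, err := site.MatchOne(site.Sitepool, u)
		if err != nil {
			log.Printf("no registered site matches %s: %v", u, err)
			continue
		}
		log.Printf("%s is handled by %s (%s)", u, s.Name, s.HomePage)
	}
}
```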
65 changes: 65 additions & 0 deletions cmd/FictionDown/search.go
@@ -0,0 +1,65 @@
package main

import (
"fmt"
"io/ioutil"
"log"

"github.com/ma6254/FictionDown/utils"

"github.com/go-yaml/yaml"
"github.com/ma6254/FictionDown/site"

"github.com/urfave/cli"
)

var search = cli.Command{
Name: "search",
Aliases: []string{"s"},
Usage: "检索盗版站点",
Flags: []cli.Flag{
cli.StringFlag{
Name: "k,keyword",
Usage: "搜索关键词",
},
cli.BoolFlag{
Name: "put,p",
Usage: "对比并放入缓存文件",
},
},
Action: func(c *cli.Context) error {
keyword := c.String("keyword")
r, err := site.Search(keyword)
if err != nil {
return err
}
if !c.Bool("put") {
fmt.Printf("搜索到%d个内容:\n", len(r))
for _, v := range r {
fmt.Printf("%s %s %s\n", v.BookURL, v.BookName, v.Author)
}
} else {
err := initLoadStore(c)
if err != nil {
return err
}
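			// keep only results whose author and book name both match the cached book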
rrr := []site.ChaperSearchResult{}
for _, v := range r {
if (v.Author == chapter.Author) && (v.BookName == chapter.BookName) {
log.Printf("%s %s %s", v.BookURL, v.BookName, v.Author)
rrr = append(rrr, v)
}
}
for _, v := range rrr {
chapter.Tmap = append(chapter.Tmap, v.BookURL)
}
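			// de-duplicate the merged source URLs before writing them back to the cache file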
chapter.Tmap = utils.TupleSlice(chapter.Tmap)
b, err := yaml.Marshal(chapter)
if err != nil {
return err
}
			if err := ioutil.WriteFile(filename, b, 0775); err != nil {
				return err
			}
}
return nil
},
}
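`site.Search` itself is not part of this diff, so the following is only a guess at its shape: given the per-site `Search` hook added in `site/81new.go` below, a plausible aggregator would fan the keyword out over `Sitepool` and concatenate the results. Everything here except `Sitepool`, `ChaperSearchResult`, and the `Search` field is an assumption.

```go
// Hypothetical sketch of site.Search (package site); not the actual implementation.
func Search(keyword string) ([]ChaperSearchResult, error) {
	var all []ChaperSearchResult
	for _, s := range Sitepool {
		if s.Search == nil {
			// this site adapter has no search support yet
			continue
		}
		r, err := s.Search(keyword)
		if err != nil {
			// one failing site should not abort the whole search
			continue
		}
		all = append(all, r...)
	}
	return all, nil
}
```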
43 changes: 3 additions & 40 deletions matching/matching.go
@@ -2,6 +2,8 @@ package matching

import (
"regexp"

"github.com/ma6254/FictionDown/utils"
)

//TitleAlias returns alternative (alias) forms of a title
@@ -30,7 +32,7 @@ func TitleAlias(s string) (alias []string) {
if v == s {
continue
}
if StringInSlice(v, alias) {
if utils.StringInSlice(v, alias) {
continue
}
alias = append(alias, v)
@@ -39,42 +41,3 @@
}
return
}

// StringInSlice reports whether s is in ss, like Python's "a in b"
func StringInSlice(s string, ss []string) bool {
for _, v := range ss {
if s == v {
return true
}
}
return false
}

//TupleSlice removes duplicate strings
func TupleSlice(a []string) []string {
b := make([]string, len(a))
ia := make([]int, len(a))
for k, v := range a {
if ia[k] == 0 {
b = append(b, v)
}
ia[k]++
}
return b
}

//SimilarSlice compares two string slices and returns the number of strings they share (so i never exceeds len(a) or len(b))
func SimilarSlice(a, b []string) (i int) {
a = TupleSlice(a)
b = TupleSlice(b)
for _, va := range a {
B:
for _, vb := range b {
if va == vb {
i++
break B
}
}
}
return
}
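The helpers deleted above (`StringInSlice`, `TupleSlice`, `SimilarSlice`) have moved into the shared `utils` package, which is why `TitleAlias` now calls `utils.StringInSlice`. Note that the removed `TupleSlice` shown here never actually de-duplicated anything: it pre-fills `b` with `len(a)` empty strings and indexes `ia` by loop position, so `ia[k]` is always zero. A minimal correct sketch with the same signature, assuming the `utils` version behaves this way, is:

```go
// TupleSlice removes duplicate strings while preserving first-seen order.
// Sketch only; the real implementation lives in github.com/ma6254/FictionDown/utils.
func TupleSlice(a []string) []string {
	seen := make(map[string]struct{}, len(a))
	out := make([]string, 0, len(a))
	for _, v := range a {
		if _, ok := seen[v]; ok {
			continue
		}
		seen[v] = struct{}{}
		out = append(out, v)
	}
	return out
}
```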
124 changes: 92 additions & 32 deletions site/81new.go
@@ -4,19 +4,18 @@ import (
"fmt"
"io"
"net/url"
"regexp"
"strings"

"github.com/ma6254/FictionDown/utils"

"github.com/antchfx/htmlquery"
"github.com/ma6254/FictionDown/store"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
)

// Www81newCom 新八一中文网
type Www81newCom struct {
}

func (b *Www81newCom) BookInfo(body io.Reader) (s *store.Store, err error) {
func wwww81newcomBookInfo(body io.Reader) (s *store.Store, err error) {
body = transform.NewReader(body, simplifiedchinese.GBK.NewDecoder())
doc, err := htmlquery.Parse(body)
if err != nil {
@@ -79,35 +78,96 @@ func (b *Www81newCom) BookInfo(body io.Reader) (s *store.Store, err error) {
return
}

func (b *Www81newCom) Chapter(body io.Reader) ([]string, error) {
body = transform.NewReader(body, simplifiedchinese.GBK.NewDecoder())
doc, err := htmlquery.Parse(body)
if err != nil {
return nil, err
}
var wwww81newcom = SiteA{
Name: "新八一中文网",
HomePage: "https://www.81new.com/",
Match: []string{
		`https://www\.81new\.com/\d+/\d+/\d+\.html`,
`https://www\.81new\.com/\d+/\d+/d+\.html`,
},
BookInfo: wwww81newcomBookInfo,
Chapter: func(body io.Reader) ([]string, error) {
body = transform.NewReader(body, simplifiedchinese.GBK.NewDecoder())
doc, err := htmlquery.Parse(body)
if err != nil {
return nil, err
}

M := []string{}
//list
// nodeContent := htmlquery.Find(doc, `//div[@id="content"]/text()`)
nodeContent := htmlquery.Find(doc, `//*[@id="articlecontent"]/text()`)
if len(nodeContent) == 0 {
err = fmt.Errorf("No matching content")
return nil, err
}
for _, v := range nodeContent {
t := htmlquery.InnerText(v)
t = strings.TrimSpace(t)

switch t {
case
"[八一中文网 请记住",
"手机版访问 m.81new.com 绿色无弹窗]",
"":
continue
M := []string{}
//list
// nodeContent := htmlquery.Find(doc, `//div[@id="content"]/text()`)
nodeContent := htmlquery.Find(doc, `//*[@id="articlecontent"]/text()`)
if len(nodeContent) == 0 {
err = fmt.Errorf("No matching content")
return nil, err
}
for _, v := range nodeContent {
t := htmlquery.InnerText(v)
t = strings.TrimSpace(t)

switch t {
case
"[八一中文网 请记住",
"手机版访问 m.81new.com 绿色无弹窗]",
"":
continue
}

M = append(M, t)
}

M = append(M, t)
}
return M, nil
},
Search: func(s string) (result []ChaperSearchResult, err error) {
baseurl, err := url.Parse("https://www.81new.com/modules/article/search.php")
if err != nil {
return
}
value := baseurl.Query()
gbk_word, _ := simplifiedchinese.GBK.NewEncoder().String(s)
value.Add("searchkey", gbk_word)
baseurl.RawQuery = value.Encode()

// Get WebPage

return M, nil
resp, err := utils.RequestGet(baseurl.String())
if err != nil {
return nil, err
}
defer resp.Body.Close()
if regexp.MustCompile(`/modules/article/search\.php`).MatchString(resp.Request.URL.Path) {
			// multiple search results
body := transform.NewReader(resp.Body, simplifiedchinese.GBK.NewDecoder())
doc, err := htmlquery.Parse(body)
if err != nil {
return nil, err
}
r := htmlquery.Find(doc, `//table[@id="author"]/tbody/tr`)
if len(r) == 0 {
return nil, nil
}
for _, v := range r[1:] {
a := htmlquery.FindOne(v, `/*[1]/a`)
r := ChaperSearchResult{
BookName: htmlquery.InnerText(a),
Author: htmlquery.InnerText(htmlquery.FindOne(v, `/*[3]`)),
BookURL: htmlquery.SelectAttr(a, "href"),
}
result = append(result, r)
}
} else if regexp.MustCompile(`/\d+/\d+/*`).MatchString(resp.Request.URL.Path) {
			// a single search result (redirected straight to the book page)
store, err := wwww81newcomBookInfo(resp.Body)
if err != nil {
return nil, err
}
result = append(result, ChaperSearchResult{
BookName: store.BookName,
Author: store.Author,
BookURL: resp.Request.URL.String(),
})
}

return
},
}
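For readers without the rest of the `site` package at hand, the literal above implies roughly the following adapter shape. The field names and function signatures are taken from this file; the concrete type declarations are assumptions, not the package's actual definitions.

```go
// Assumed shape of the types behind the SiteA literal above.
type ChaperSearchResult struct {
	BookName string
	Author   string
	BookURL  string
}

type SiteA struct {
	Name     string   // human-readable site name
	HomePage string   // home page URL
	Match    []string // regexps that identify this site's book/chapter URLs
	BookInfo func(body io.Reader) (*store.Store, error)
	Chapter  func(body io.Reader) ([]string, error)
	Search   func(keyword string) ([]ChaperSearchResult, error)
}
```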
