pholcus is a web crawler framework written in Go. I wrote two spider rules for it: one that grabs the latest categorized news from People's Daily Online (人民网), and one that scrapes every issue of the IJGUC journal. pholcus is a nicely built open-source project, though I don't find Go itself all that fun.
The rules are on GitHub.
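
For context, rules like the two below live in a spider library package and register themselves in init(); a small main program then imports that package and starts the framework. A minimal sketch of such an entry point, based on the pholcus README (the spider_lib import path here is hypothetical; use whatever path your rule package resolves to):

package main

import (
    "github.com/henrylee2cn/pholcus/exec"

    _ "path/to/your/spider_lib" // hypothetical path; pulls in the rules via their init()
)

func main() {
    // Start pholcus; "web" selects the browser UI.
    exec.DefaultRun("web")
}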

People's Daily Online news

package spider_lib

import (
    "encoding/json"
    "log"

    "github.com/henrylee2cn/pholcus/app/downloader/request" // required
    . "github.com/henrylee2cn/pholcus/app/spider"           // required
)

func init() {
    People.Register()
}

// Item mirrors one entry in the People's Daily Online news JSON feed.
type Item struct {
    Id       string `json:"id"`
    Title    string `json:"title"`
    Url      string `json:"url"`
    Date     string `json:"date"`
    NodeId   string `json:"nodeId"`
    ImgCount string `json:"imgCount"`
}

// News is the top-level document of the feed.
type News struct {
    Items []Item `json:"items"`
}

var news News

var People = &Spider{
    Name:         "人民网新闻抓取",
    Description:  "人民网最新分类新闻",
    EnableCookie: false,
    RuleTree: &RuleTree{
        Root: func(ctx *Context) {
            ctx.AddQueue(&request.Request{
                Method: "GET",
                Url:    "http://news.people.com.cn/210801/211150/index.js?cache=false",
                Rule:   "新闻列表",
            })
        },

        Trunk: map[string]*Rule{
            "新闻列表": {
                ParseFunc: func(ctx *Context) {
                    // The feed is JSON, so parse the raw response text
                    // instead of going through the DOM.
                    str := ctx.GetText()

                    err := json.Unmarshal([]byte(str), &news)
                    if err != nil {
                        log.Printf("parse error: %v\n", err)
                        return
                    }

                    // Queue one detail request per news item, carrying its
                    // metadata along in Temp for the detail rule.
                    for i := 0; i < len(news.Items); i++ {
                        ctx.AddQueue(&request.Request{
                            Url:  news.Items[i].Url,
                            Rule: "热点新闻",
                            Temp: map[string]interface{}{
                                "id":       news.Items[i].Id,
                                "title":    news.Items[i].Title,
                                "date":     news.Items[i].Date,
                                "newsType": news.Items[i].NodeId,
                            },
                        })
                    }
                },
            },

            "热点新闻": {
                // Note: the field list must match the Output call below,
                // one for one and in the same order.
                ItemFields: []string{
                    "ID",
                    "标题",
                    "内容",
                    "类别",
                    "ReleaseTime",
                },
                ParseFunc: func(ctx *Context) {
                    query := ctx.GetDom()

                    // Grab the article body text.
                    content := query.Find("#p_content").Text()

                    // Hand the record over as one output row.
                    ctx.Output(map[int]interface{}{
                        0: ctx.GetTemp("id", ""),
                        1: ctx.GetTemp("title", ""),
                        2: content,
                        3: ctx.GetTemp("newsType", ""),
                        4: ctx.GetTemp("date", ""),
                    })
                },
            },
        },
    },
}
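
What makes this rule simple is that the "page" at index.js is just a JSON feed, so there is no DOM parsing at all: decode, then queue one request per item. The decode step can be tried standalone; here is a minimal sketch with a made-up payload that only shares its field names with the real feed:

package main

import (
    "encoding/json"
    "fmt"
)

type Item struct {
    Id     string `json:"id"`
    Title  string `json:"title"`
    Url    string `json:"url"`
    Date   string `json:"date"`
    NodeId string `json:"nodeId"`
}

type News struct {
    Items []Item `json:"items"`
}

func main() {
    // Invented payload shaped like the index.js feed.
    payload := `{"items":[{"id":"282","title":"示例标题","url":"http://example.com/a.html","date":"2016-05-01","nodeId":"1001"}]}`

    var news News
    if err := json.Unmarshal([]byte(payload), &news); err != nil {
        fmt.Println("parse error:", err)
        return
    }
    for _, it := range news.Items {
        fmt.Println(it.Id, it.Title, it.Url)
    }
}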

IJGUC journal

package spider_lib

import (
    "regexp"
    "strconv"

    "github.com/PuerkitoBio/goquery"                        // DOM parsing
    "github.com/henrylee2cn/pholcus/app/downloader/request" // required
    . "github.com/henrylee2cn/pholcus/app/spider"           // required
)

func init() {
    IJGUC.Register()
}

// extract returns the first capture group of pattern in s, or "" when
// there is no match, so a missing field cannot panic the rule.
func extract(pattern, s string) string {
    if m := regexp.MustCompile(pattern).FindStringSubmatch(s); len(m) > 1 {
        return m[1]
    }
    return ""
}

var IJGUC = &Spider{
    Name:         "IJGUC期刊",
    Description:  "IJGUC期刊",
    EnableCookie: false,
    RuleTree: &RuleTree{
        Root: func(ctx *Context) {
            ctx.AddQueue(&request.Request{
                Url:  "http://www.inderscience.com/info/inarticletoc.php?jcode=ijguc&year=2016&vol=7&issue=1",
                Rule: "期刊列表",
            })
        },

        Trunk: map[string]*Rule{
            "期刊列表": {
                ParseFunc: func(ctx *Context) {
                    query := ctx.GetDom()
                    // The table of contents groups issues under
                    // #eventbody1 .. #eventbody7; queue every link in each.
                    for i := 1; i <= 7; i++ {
                        id := "#eventbody" + strconv.Itoa(i) + " a"
                        query.Find(id).Each(func(j int, s *goquery.Selection) {
                            if url, ok := s.Attr("href"); ok {
                                ctx.AddQueue(&request.Request{Url: url, Rule: "文章列表"})
                            }
                        })
                    }
                },
            },
            "文章列表": {
                ParseFunc: func(ctx *Context) {
                    query := ctx.GetDom()
                    // Target selector: #journalcol1 article table tbody tr td:eq(1) table:eq(1) a
                    // goquery has no :eq(), so walk down by index instead.
                    query.Find("#journalcol1 article table tbody tr td").Each(func(i int, td *goquery.Selection) {
                        if i == 1 {
                            td.Find("table").Each(func(j int, table *goquery.Selection) {
                                if j == 1 {
                                    table.Find("a").Each(func(k int, a *goquery.Selection) {
                                        // Every second link is the abstract page.
                                        if k%2 == 0 {
                                            if url, ok := a.Attr("href"); ok {
                                                ctx.AddQueue(&request.Request{Url: url, Rule: "文章页"})
                                            }
                                        }
                                    })
                                }
                            })
                        }
                    })
                },
            },
            "文章页": {
                // Note: the field list must match the Output call below,
                // one for one and in the same order.
                ItemFields: []string{
                    "Title",
                    "Author",
                    "Addresses",
                    "Journal",
                    "Abstract",
                    "Keywords",
                    "DOI",
                },
                ParseFunc: func(ctx *Context) {
                    query := ctx.GetDom()
                    // Grab the page text and strip any leftover tags.
                    content := query.Find("#col1").Text()
                    content = regexp.MustCompile(`<[\S\s]+?>`).ReplaceAllString(content, "")

                    // The abstract page is one labeled run of text, so
                    // slice each field out between its label and the next.
                    // Pages with a single author use "Address:" instead of
                    // "Addresses:", hence the fallbacks.
                    title := extract("Title:(.*?)Author:", content)
                    author := extract("Author:(.*?)Addresses:", content)
                    if author == "" {
                        author = extract("Author:(.*?)Address:", content)
                    }
                    addresses := extract("Addresses:(.*?)Journal:", content)
                    if addresses == "" {
                        addresses = extract("Address:(.*?)Journal:", content)
                    }
                    journal := extract("Journal:(.*?)Abstract:", content)
                    abstract := extract("Abstract:(.*?)Keywords:", content)
                    keywords := extract("Keywords:(.*?)DOI:", content)
                    // Capture the DOI token itself rather than slicing a
                    // fixed 43-character window, which mixed byte indexes
                    // into a []rune and broke on multibyte text.
                    doi := extract(`DOI:\s*(\S+)`, content)

                    ctx.Output(map[int]interface{}{
                        0: title,
                        1: author,
                        2: addresses,
                        3: journal,
                        4: abstract,
                        5: keywords,
                        6: doi,
                    })
                },
            },
        },
    },
}
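
The field extraction above is plain regex slicing between labels, which is easy to test outside the crawler. A minimal sketch with an invented abstract-page string (only the labels mirror the real Inderscience page):

package main

import (
    "fmt"
    "regexp"
)

// extract returns the first capture group of pattern in s, or "".
func extract(pattern, s string) string {
    if m := regexp.MustCompile(pattern).FindStringSubmatch(s); len(m) > 1 {
        return m[1]
    }
    return ""
}

func main() {
    // Invented sample in the shape of an abstract page.
    content := "Title: A sample paper Author: A. Author Addresses: Somewhere Journal: IJGUC Abstract: Some words. Keywords: grid; cloud DOI: 10.1504/IJGUC.2016.12345"

    fmt.Println(extract("Title:(.*?)Author:", content))
    fmt.Println(extract("Keywords:(.*?)DOI:", content))
    fmt.Println(extract(`DOI:\s*(\S+)`, content))
}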