-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpa_books.py
68 lines (60 loc) · 2.41 KB
/
pa_books.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
import socket
import time

import requests
from lxml import etree
#socket.setdefaulttimeout(20)
# Browser-like HTTP headers; passed as ``headers=`` to the requests.get
# calls in this file.
headers = {
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
    ),
    'Referer': 'http://www.baidu.com/',
}
def get_content(title, charpter, src, timeout=20):
    """Download one chapter page and append its paragraphs to the book file.

    Fetches ``src``, selects every ``<p>`` that is a direct child of
    ``<div class="entry-content">`` and appends the ``.text`` of each
    paragraph, one per line, to the book's ``.txt`` file.  Paragraphs whose
    ``.text`` is ``None`` are skipped.

    Any exception (network, parsing, file I/O) is caught and printed, so a
    single failing chapter does not abort the caller's loop.

    Args:
        title: Book title; used only to build the output file name.
        charpter: Chapter name.  Not used in the body; kept so existing
            callers keep working.
        src: URL of the chapter page.
        timeout: Seconds to wait for the server.  Without a timeout,
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        None.
    """
    try:
        response = requests.get(src, headers=headers, timeout=timeout)
        try:
            page = response.content
        finally:
            # Close the response even when reading the body fails.
            response.close()

        # Fixed pause after every request (throttles the crawl).
        time.sleep(3)

        html = etree.HTML(page)
        lines = []
        for paragraph in html.xpath('//div[@class="entry-content"]/p'):
            text = paragraph.text
            if text is None:
                continue
            # Round-trip through GBK with 'ignore' to drop every character
            # GBK cannot represent.
            # NOTE(review): presumably needed because the file is opened
            # with the platform default encoding (GBK on the author's
            # machine) -- confirm before changing.
            lines.append(text.encode('gbk', 'ignore').decode('gbk') + '\n')

        if lines:
            # Open the file once per chapter instead of once per paragraph.
            file_name = '.\\books\\' + title + '.txt'
            with open(file_name, 'a') as f:
                f.writelines(lines)
    except Exception as e:
        # Best-effort: report the problem and let the caller move on.
        print(e)
def get_charpter(title, src, timeout=20):
    """Walk a book's table of contents and download every chapter.

    Fetches ``src``, selects the ``<a>`` elements matched by
    ``//div[@class="entry-content"]/ul/li[*]/a`` and, for each link in
    document order, prints the chapter name and URL, appends the chapter
    name as a line to the book's ``.txt`` file, and then calls
    ``get_content`` to append that chapter's text.

    Args:
        title: Book title; used to build the output file name and passed
            on to ``get_content``.
        src: URL of the book's chapter-list page.
        timeout: Seconds to wait for the server.  Without a timeout,
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        None.

    Raises:
        Whatever ``requests.get``, the parser or ``open`` raise; nothing
        is caught here.
    """
    response = requests.get(src, headers=headers, timeout=timeout)
    try:
        page = response.content
    finally:
        # Close exactly once, even when reading the body fails.
        response.close()

    # Fixed pause after every request (throttles the crawl).
    time.sleep(3)

    html = etree.HTML(page)
    file_name = '.\\books\\' + title + '.txt'
    for link in html.xpath('//div[@class="entry-content"]/ul/li[*]/a'):
        # ``link.text`` is None when the anchor has no leading text;
        # fall back to '' so the literal string "None" is never written.
        # The GBK round-trip drops characters GBK cannot represent.
        charpter = (link.text or '').encode('gbk', 'ignore').decode('gbk')
        content_src = link.get('href')
        print(charpter, content_src)

        # The file is reopened (and therefore flushed) per chapter because
        # get_content appends to the same file through its own handle.
        with open(file_name, 'a') as f:
            f.write(charpter + '\n')

        get_content(title, charpter, content_src)
# --- Script entry: list the site's categories and download each book ---

# A timeout keeps a stalled server from hanging the script forever.
response = requests.get("https://it.95590.org/", headers=headers, timeout=20)
try:
    html = etree.HTML(response.content)
finally:
    # Close once, right after parsing, instead of on every loop iteration.
    response.close()

# Link texts and link targets of the entries under the element with
# id="categories"; the two lists are paired up positionally below.
title_list = html.xpath('//*[@id="categories"]/ul/li[*]/a/text()')
src_list = html.xpath('//*[@id="categories"]/ul/li[*]/a/@href')

# Titles in this list are skipped by the loop below.
book_list = ['刘强东·注定震惊世界', '任正非这个人', '雷军·人因梦想而伟大',
             '张亚勤·让智慧起舞', '华为狼道', 'IBM帝国缔造者']

# Make sure the output directory exists before the first open(..., 'w').
# dirname() is '' where '\\' is not a path separator, hence the fallback.
os.makedirs(os.path.dirname('.\\books\\') or '.', exist_ok=True)

# print(len(title_list))
for title, src in zip(title_list, src_list):
    # Drop the last five characters of the link text.
    # NOTE(review): presumably a fixed-length suffix in the site's markup --
    # confirm against the live page.
    title = str(title)[0:-5]
    print(title, src)
    if title in book_list:
        continue

    # Start the book file fresh with the title as its first line; the
    # functions called below append to it.
    file_name = '.\\books\\' + title + '.txt'
    with open(file_name, 'w') as f:
        f.write(title + '\n')
    get_charpter(title, src)