import os
import sys
import time

import requests
from pyquery import PyQuery as pq

import proxy
class SHY:
    """Scraper for www.cmshy.com that fetches pages through a rotating proxy list.

    ``page_sort`` downloads the chapter index of one book, fetches every
    chapter via ``detail_page`` and appends the text to a file under
    ``./txt`` via ``write_file``.  All HTTP traffic goes through
    ``request_html``, which advances ``proxy_id`` whenever the current proxy
    fails.
    """

    # Upper bound on how often a page is re-fetched after a parsing error.
    # Retries are bounded loops (not recursion) so a page that always fails
    # cannot exhaust the interpreter's recursion limit.
    MAX_RETRIES = 3

    def __init__(self):
        """Load the proxy list from ``proxy.PROXY`` and start at its first entry."""
        p = proxy.PROXY()
        # Each entry is passed unchanged as the ``proxies`` argument of
        # ``requests.get`` -- presumably a scheme -> proxy-URL mapping;
        # confirm against proxy.PROXY.get().
        self.proxy_ip_list = p.get()
        self.proxy_id = 0

    def request_html(self, url):
        """Fetch ``url`` through the current proxy, rotating proxies on failure.

        A non-200 status or any exception raised by the request moves on to
        the next proxy in ``proxy_ip_list`` and tries again.  ``proxy_id`` is
        kept on the instance, so proxies that failed are not reused by later
        calls.

        Args:
            url: Address of the page to download.

        Returns:
            The response body decoded as GBK, or ``None`` once every
            remaining proxy has failed.
        """
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        head = {'User-Agent': user_agent}
        # Iterating (instead of recursing without ``return``) makes sure the
        # page fetched after a proxy switch actually reaches the caller.
        while self.proxy_id < len(self.proxy_ip_list):
            current_proxy = self.proxy_ip_list[self.proxy_id]
            print(current_proxy)
            try:
                response = requests.get(url=url, headers=head, timeout=8, proxies=current_proxy)
            except Exception as e:
                # Deliberately broad: any failure of this proxy is treated as
                # "try the next one" rather than aborting the crawl.
                print(e)
            else:
                if response.status_code == 200:
                    response.encoding = 'gbk'
                    return response.text
                print('change proxy')
            self.proxy_id += 1
        return None

    def page_sort(self, page):
        """Download every chapter of book ``page`` and append it to its text file.

        Nothing is written when the index page cannot be fetched or parsed.
        A chapter that cannot be downloaded is skipped; the remaining
        chapters are still processed.

        Args:
            page: Numeric id of the book, used in the index URL and file name.
        """
        index = self._fetch_index(page)
        if index is None:
            return
        title, chapter_urls = index
        for chapter_url in chapter_urls:
            content = self.detail_page(chapter_url)
            if content:
                self.write_file(page, title, content)
            # Pause between chapter requests.
            time.sleep(2)

    def _fetch_index(self, page):
        """Return ``(title, chapter_urls)`` for book ``page``, or ``None`` on failure."""
        url = 'http://www.cmshy.com/html/' + str(page) + '/index.html'
        for _ in range(self.MAX_RETRIES):
            html = self.request_html(url)
            if not html:
                # No usable proxy left (None) or an empty body: give up.
                return None
            try:
                html_content = pq(html)
                title = html_content('.directory-box .book-tit').text()
                links = [
                    pq(item).find('a').attr('href')
                    for item in html_content('.directory-box .cell-items ul li')
                ]
            except Exception as e:
                print('page sort error')
                print(e)
                continue
            # Entries without an href cannot be fetched, so drop them here.
            return title, [href for href in links if href]
        return None

    def write_file(self, page, title, content):
        """Append ``content`` to ``./txt/<page>-<title>.txt``.

        The ``./txt`` directory is created when missing and the file is
        written as UTF-8.  Path separators in ``title`` are replaced with
        ``_`` because the title comes from scraped HTML and must not be able
        to redirect the write outside ``./txt``.  Failures are reported on
        stdout and never raised.

        Args:
            page: Numeric id of the book.
            title: Book title used in the file name.
            content: Text to append.
        """
        try:
            safe_title = title.replace('/', '_').replace('\\', '_')
            os.makedirs('./txt', exist_ok=True)
            path = './txt/' + str(page) + '-' + safe_title + '.txt'
            # ``with`` closes the handle even when the write itself fails.
            with open(path, 'a', encoding='utf-8') as f:
                f.write(content)
        except (OSError, TypeError, ValueError, AttributeError) as e:
            print('write file error')
            print(e)

    def detail_page(self, url):
        """Download one chapter page and return its text.

        Args:
            url: Address of the chapter page.

        Returns:
            Title, author and content of the ``.read-wrap`` element joined by
            newlines and terminated by a blank line, or ``None`` when the
            page could not be fetched or parsed within ``MAX_RETRIES``
            attempts.
        """
        for _ in range(self.MAX_RETRIES):
            html = self.request_html(url)
            if not html:
                return None
            try:
                read = pq(html)('.read-wrap')
                c1 = read('.title').text()
                c2 = read('.author').text()
                c3 = read('.content').text()
                return c1 + '\n' + c2 + '\n' + c3 + '\n\n'
            except Exception as e:
                print('detail page error')
                print(e)
        return None
if __name__ == '__main__':
    # Script entry point: walk index pages 1736 through 4999 in ascending
    # order with a single SHY instance, pausing two seconds between pages.
    crawler = SHY()
    page_numbers = range(1736, 5000)
    for page_no in page_numbers:
        banner = '~~~~~~~~~~ page ' + str(page_no) + ' ~~~~~~~~~~~'
        print(banner)
        crawler.page_sort(page_no)
        time.sleep(2)