当前位置:首页 > Python爬虫 > Python多线程抓豆瓣图书

Python多线程抓豆瓣图书

作者:二营长 发布时间:2016-12-29 点击:

今天二营长又有了新的需求,需要抓豆瓣的图书信息。于是又拼了个轮子。

#coding:utf-8
# Multi-threaded crawler: fetch douban book-series pages and save their titles.
import requests,re,time,string,random,sys,threading
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool
# Python 2 only: reset the default codec to UTF-8 so implicit str<->unicode
# conversions of the Chinese page titles don't raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf-8')
author = 'heziliang'
# Number of worker threads for the pool created in main().
totalThread = 1
def getUA():
	"""Return one randomly chosen desktop Chrome User-Agent string."""
	desktop_agents = (
		"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
		"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
		"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
	)
	return random.choice(desktop_agents)

# Static request headers for book.douban.com.
# NOTE(review): getUA() is evaluated exactly once, at import time, so every
# request in this process reuses the same User-Agent -- per-request rotation
# would require calling getUA() inside the worker instead. Confirm intent.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "zh-CN,zh;q=0.8",
	"Cache-Control":"max-age=0",
    "Connection": "keep-alive",
	"Host": "book.douban.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": getUA(), 
}
#27046
def gofuckyouself(id):
	try:
		r = requests.get(id,headers=headers)
	except Exception,e:
		print e
	if "paginator" not in r.content:		
		s = BeautifulSoup(r.content,"lxml")		
		for wz in s.find_all('h2'):
			try:
				print wz.get_text().replace('\n','').replace(' ','')
				mutex.acquire()	
				with open('result.txt',r'a+') as my:
					my.write(wz.get_text().replace('\n','').replace(' ','').encode('utf-8')+'\n')
				with open('hasdone.txt',r'a+') as you:
					you.write(id+'\n')
				mutex.release()

			except Exception,e:
				print e
				pass
	if "paginator" in r.content:
		print 'has paginator'
		s = BeautifulSoup(r.content,"lxml")
		for wz in s.find_all('h2'):
			try:
				print wz.get_text().replace('\n','').replace(' ','')
				mutex.acquire()
				with open('result.txt',r'a+') as my:
					my.write(wz.get_text().replace('\n','').replace(' ','').encode('utf-8')+'\n')
				with open('hasdone.txt',r'a+') as you:
					you.write(id+'\n')
				mutex.release()
			except Exception,e:
				print e
				pass
		for fenpage in s.find('div',attrs={'class':'paginator'}).find_all('a'):
			if fenpage.get_text().isdigit():
				print fenpage.get('href')
				r_paginator = requests.get(fenpage.get('href'),headers=headers)
				s_paginator = BeautifulSoup(r_paginator.content,"lxml")
				for title in s_paginator.find_all('h2'):
					try:
						print title.get_text().replace('\n','').replace(' ','')
						mutex.acquire()
						with open('result.txt',r'a+') as my:
							my.write(title.get_text().replace('\n','').replace(' ','').encode('utf-8')+'\n')
						mutex.release()
					except Exception,e:
						print e
						pass			
def main():
	"""Build the full list of series URLs (ids 4142..27046) and crawl
	them concurrently with a pool of totalThread worker threads.
	"""
	# The original also built urlNum / urlNum_list counters that were
	# never read anywhere -- dead code, removed.
	url_list = ['https://book.douban.com/series/%s' % i
	            for i in range(4142, 27047)]
	pool = ThreadPool(totalThread)
	pool.map(gofuckyouself, url_list)
	pool.close()
	pool.join()
# Module-level lock shared by all worker threads when writing output files.
mutex = threading.Lock()

# Standard script guard: run the crawl only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
	main()

更多代码到github:https://github.com/hzlRises

邮箱:techseo.cn@gmail.com,欢迎交流。
上一篇:【神器】Python批量生成sitemap      下一篇:百度无限制主动推送