Today the second battalion commander came up with a new requirement: scrape book information from Douban. So I cobbled together yet another wheel.
#coding:utf-8
import requests, re, time, string, random, sys, threading
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool

# Python 2: force UTF-8 as the default encoding so file writes don't raise
reload(sys)
sys.setdefaultencoding('utf-8')

author = 'heziliang'
totalThread = 1

def getUA():
    # Pick a random desktop Chrome user agent. Note: headers below is
    # built once at module load, so one UA is used for the whole run.
    uaList = [
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    ]
    return random.choice(uaList)

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "book.douban.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": getUA(),
}

def saveTitles(soup, id=None):
    # Pull every <h2> (one per book) from the parsed page and append it to
    # result.txt; also log the series URL to hasdone.txt when one is given.
    # The lock keeps concurrent threads from interleaving writes, and the
    # try/finally guarantees it is released even if a write fails.
    for wz in soup.find_all('h2'):
        try:
            title = wz.get_text().replace('\n', '').replace(' ', '')
            print title
            mutex.acquire()
            try:
                with open('result.txt', 'a+') as my:
                    my.write(title.encode('utf-8') + '\n')
                if id is not None:
                    with open('hasdone.txt', 'a+') as you:
                        you.write(id + '\n')
            finally:
                mutex.release()
        except Exception as e:
            print e

# 27046
def gofuckyouself(id):
    try:
        r = requests.get(id, headers=headers)
    except Exception as e:
        print e
        return  # without a response there is nothing to parse
    s = BeautifulSoup(r.content, "lxml")
    saveTitles(s, id)
    if "paginator" in r.content:
        print 'has paginator'
        # Walk the numbered pagination links and scrape each page too
        for fenpage in s.find('div', attrs={'class': 'paginator'}).find_all('a'):
            if fenpage.get_text().isdigit():
                print fenpage.get('href')
                r_paginator = requests.get(fenpage.get('href'), headers=headers)
                s_paginator = BeautifulSoup(r_paginator.content, "lxml")
                saveTitles(s_paginator)

def main():
    # Douban book series pages are numbered sequentially; walk the ID range
    url_list = []
    for i in range(4142, 27047):
        url_list.append('https://book.douban.com/series/%s' % i)
    pool = ThreadPool(totalThread)
    pool.map(gofuckyouself, url_list)
    pool.close()
    pool.join()

mutex = threading.Lock()
main()
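The script above is Python 2, which has been end-of-life since 2020. For anyone who wants to rerun it today, here is a minimal Python 3 sketch of the same fetch-and-parse step, under the same assumptions as the original (requests, beautifulsoup4 and lxml installed; same output file and series-ID range). The pagination branch is omitted for brevity, and crawl_series is my name for the worker, not something from the original script:

# coding: utf-8
# Minimal Python 3 sketch: fetch one Douban series page, pull every <h2>
# title, and append the cleaned titles to result.txt under a lock.
import random
import threading
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool

UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
]
write_lock = threading.Lock()

def crawl_series(url):
    headers = {"User-Agent": random.choice(UA_LIST)}  # rotate UA per request
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return
    soup = BeautifulSoup(r.text, "lxml")
    titles = [h2.get_text().replace("\n", "").replace(" ", "")
              for h2 in soup.find_all("h2")]
    # Hold the lock while writing so threads don't interleave lines
    with write_lock, open("result.txt", "a", encoding="utf-8") as f:
        for t in titles:
            f.write(t + "\n")

if __name__ == "__main__":
    urls = ["https://book.douban.com/series/%d" % i for i in range(4142, 27047)]
    pool = ThreadPool(1)  # single worker, matching totalThread = 1 above
    pool.map(crawl_series, urls)
    pool.close()
    pool.join()

One design note: the original sets totalThread = 1, so the thread pool is effectively a polite sequential crawler, presumably to stay under Douban's rate limiting; raising it will speed things up but also makes bans more likely.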
More code on GitHub: https://github.com/hzlRises