#coding:utf-8
"""Scrape ``title="..."`` values from the www.kongfz.com topic listings.

For every letter A-Z the script fetches ``/topic/<letter>/``, captures the
page numbers found in ``/topic/<letter>-<n>/`` anchors that directly follow
a ``</span>``, then fetches pages ``1..n`` and appends every captured title
to ``<letter>.txt`` in the current working directory.

The crawl starts as soon as the module is executed or imported; there is no
``__main__`` guard, which matches the original flat script.  Response bodies
are handled as raw bytes so the same code runs on Python 2 and Python 3 and
the bytes written to disk are exactly the bytes the server sent.
"""
author = '二营长'
#https://github.com/hzlRises/hzlgithub/blob/master/jd.com/book.py
import re
import string
import time

import requests
from bs4 import BeautifulSoup

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
    "Cookie": "",
    "Host": "www.kongfz.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}

# Seconds to pause after each listing-page request.
# NOTE(review): the original indentation was lost, so the exact position of
# the sleep is an assumption (once per page request) -- confirm.
_REQUEST_DELAY = 1.5

# Captures the value of every title="..." attribute in a response body.
# NOTE(review): ``.*`` is greedy, so when several quoted attributes share one
# line the capture runs to the LAST double quote on that line.  Kept exactly
# as in the original because the site's markup cannot be checked from here.
_TITLE_RE = re.compile(br'title="(.*)"')

# Python 2 (where ``bytes is str``) keeps the original text-mode 'a+' append;
# Python 3 has to append in binary mode because response bodies are bytes.
_APPEND_MODE = 'a+' if bytes is str else 'ab'


def _page_number_pattern(word):
    """Return a compiled bytes regex capturing page numbers for *word*.

    The original pattern hard-coded ``/topic/A-`` although the page URLs are
    built from the current letter; the letter is now part of the pattern.
    """
    pattern = r'</span><a href="/topic/%s-(\d+)/' % re.escape(word)
    return re.compile(pattern.encode('ascii'))


def _echo(raw):
    """Print a captured byte string to standard output.

    On Python 2 the bytes are printed unchanged, as the original did.  On
    Python 3 they are decoded first so the console does not show ``b'...'``;
    UTF-8 is an assumption about the site's encoding and is used for display
    only (undecodable bytes are replaced, never raised on).
    """
    if not isinstance(raw, str):
        raw = raw.decode('utf-8', 'replace')
    print(raw)


def _save_titles(word, body):
    """Print every title found in *body* and append it to ``<word>.txt``.

    The file is opened once per page rather than once per title.  Nothing is
    opened when the page yields no titles, so no empty file is created.
    """
    titles = _TITLE_RE.findall(body)
    if not titles:
        return
    with open('%s.txt' % word, _APPEND_MODE) as out:
        for title in titles:
            _echo(title)
            out.write(title + b'\n')


def _crawl_letter(word):
    """Fetch the index page for *word* and scrape each listing page it names.

    NOTE(review): as in the original, every captured page number triggers its
    own crawl of pages ``1..n``.  If the index page yields more than one
    match, the early pages are fetched and appended more than once.
    """
    index = requests.get('http://www.kongfz.com/topic/%s/' % word, headers=headers)
    for page in _page_number_pattern(word).findall(index.content):
        _echo(page)
        for i in range(1, int(page) + 1):
            url = 'http://www.kongfz.com/topic/%s-%s/' % (word, i)
            response = requests.get(url, headers=headers)
            _save_titles(word, response.content)
            time.sleep(_REQUEST_DELAY)


# ``ascii_uppercase`` is always A-Z and exists on Python 2 and 3, unlike the
# locale-dependent, Python-2-only ``string.uppercase``.
for word in string.ascii_uppercase:
    _crawl_letter(word)

# Archived variant of this script, kept as an inert string literal: it is
# never executed.  The literal is raw so the backslashes inside it are not
# interpreted as escape sequences.
r'''
#coding:utf-8
import requests,re,time,string,random
from bs4 import BeautifulSoup
def getUA():
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
    ]
    newUa = random.choice(uaList)
    return newUa
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
#    "Cookie": "aliyungf_tc=AQAAAPCwKmNNIA0AE3MmagNhDFYh3JrH; PHPSESSID=d72mofht5iet5mnjf27nnjp440; kfz-tid=c522d23cb2034d2aff21a1c8e44acdf8; shoppingCartSessionId=d7728a04d2c91d78b29b4bc642eb43a6; Hm_lvt_bca7840de7b518b3c5e6c6d73ca2662c=1482982081; Hm_lpvt_bca7840de7b518b3c5e6c6d73ca2662c=1482982081; Hm_lvt_33be6c04e0febc7531a1315c9594b136=1482982081; Hm_lpvt_33be6c04e0febc7531a1315c9594b136=1482982081",
    "Host": "www.kongfz.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": getUA(),
}
word = 'E'
r_fir = requests.get('http://www.kongfz.com/topic/%s/'%word,headers=headers)
for page in re.findall(r'</span><a href="/topic/A-(\d+)/',r_fir.content):
    print page
    for i in range(1,int(page)+1):
        url = 'http://www.kongfz.com/topic/%s-%s/' %(word,i)
        r = requests.get(url,headers=headers)
        for title in re.findall(r'title="(.*)"',r.content):
            print title
            with open('%s.txt'%word,r'a+') as my:
                my.write(title+'\n')
        time.sleep(1.5)
'''