Scraping the review count of Tmall items
#coding:utf-8
import requests, csv, sys, re
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2: keep csv writes from raising UnicodeEncodeError

headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.8",
    "cache-control": "max-age=0",
    "cookie": "**",  # paste a logged-in cookie here
    "upgrade-insecure-requests": "1",
    "user-agent": "",  # paste a browser user-agent here
}

csvfile = open('result_tmall.csv', 'wb')
writer = csv.writer(csvfile)
writer.writerow(('link', 'allcom'))

for link in open('get_com_sku_tmall.txt'):
    sku = link.strip().split('=')[1]  # item id taken from the ?id= query string
    print sku
    try:
        # the rate endpoint only cares about itemId; spuId/sellerId are padded with the same value
        url = 'https://rate.tmall.com/list_detail_rate.htm?itemId=%s&spuId=%s&sellerId=%s' % (sku, sku, sku)
        r = requests.get(url, headers=headers)
        total_ = re.search(r'"total":(\d+)', r.content)
        ttl = total_.group(1)  # the captured digits are the total review count
        writer.writerow((link.strip(), ttl))
    except Exception as e:
        print e
        with open('get_com_sku_fail_tmall.txt', 'a+') as my:
            my.write(link.strip() + '\n')

csvfile.close()
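Anti-bot hiccups are common, so any link that fails is appended to get_com_sku_fail_tmall.txt. Below is a minimal sketch of a retry pass over that file, under the same assumptions as the script above (same endpoint, a filled-in headers dict); retry_failed and result_tmall_retry.csv are names introduced for this sketch, not part of the original script.

#coding:utf-8
import requests, csv, re

def retry_failed(headers):
    # replay the failed links and write any recovered counts to a separate csv
    with open('result_tmall_retry.csv', 'wb') as out:
        writer = csv.writer(out)
        writer.writerow(('link', 'allcom'))
        for link in open('get_com_sku_fail_tmall.txt'):
            sku = link.strip().split('=')[1]
            url = 'https://rate.tmall.com/list_detail_rate.htm?itemId=%s&spuId=%s&sellerId=%s' % (sku, sku, sku)
            try:
                r = requests.get(url, headers=headers, timeout=60)
                m = re.search(r'"total":(\d+)', r.content)
                if m:
                    writer.writerow((link.strip(), m.group(1)))
            except Exception as e:
                print e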
Scraping the Tmall review tag cloud ("大家都写到", i.e. "everyone mentions")
#coding:utf-8
import requests, json, csv, re

def main():
    csvfile = open('res_tmall.csv', 'wb')
    write = csv.writer(csvfile)
    write.writerow(('link', 'tags'))
    for link in open('sku_tmall.txt'):
        headers = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.8",
            "Cookie": '**',  # paste a logged-in cookie here
            "referer": link.strip(),  # the item page itself, rebuilt per link
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
        }
        proid = link.strip().split('=')[1]
        print proid
        try:
            url = 'https://rate.tmall.com/listTagClouds.htm'
            payload = {
                'itemId': proid,
                'isAll': 'true',
                'isInner': 'true',
                't': '1508397490861',
                '_ksTS': '1508397490862_1297',
                'callback': 'jsonp1298',  # asks the endpoint for a jsonp1298(...) wrapper
            }
            r = requests.get(url, params=payload, timeout=60, headers=headers)
            print r.url
            if r.status_code == 200:
                tags = re.findall(r'"tag":"(.*?)"', r.content)
                write.writerow((link.strip(), '|'.join(tags)))
        except Exception as e:
            print e
            with open('fail.txt', 'a+') as f:
                f.write(link.strip() + '\n')
    csvfile.close()

if __name__ == '__main__':
    main()
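The regex above pulls the tags straight out of the raw body. Alternatively, the jsonp wrapper can be stripped and the payload parsed as real JSON. A sketch, assuming the response is shaped like jsonp1298({...}) as requested via the callback parameter above; since the exact nesting of the tag cloud payload isn't documented here, collect_tags (a helper introduced for this sketch) simply walks the whole structure and picks up every "tag" value.

#coding:utf-8
import json, re

def collect_tags(node, found):
    # recursively gather every value stored under a "tag" key
    if isinstance(node, dict):
        for k, v in node.items():
            if k == 'tag' and isinstance(v, basestring):
                found.append(v)
            else:
                collect_tags(v, found)
    elif isinstance(node, list):
        for item in node:
            collect_tags(item, found)

def parse_tag_response(body):
    # strip the jsonpNNNN( ... ) wrapper and parse the inside as JSON
    m = re.search(r'jsonp\d+\((.*)\)', body, re.S)
    if not m:
        return []
    found = []
    collect_tags(json.loads(m.group(1)), found)
    return found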
Counting the colour variants of a Tmall item
#coding:utf-8
from bs4 import BeautifulSoup
import requests, sys, time
reload(sys)
sys.setdefaultencoding('utf-8')

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "cache-control": "max-age=0",
    "Cookie": "",  # paste a logged-in cookie here
    "referer": "https://www.tmall.com/?spm=a220o.1000855.0.0.5c90c348cacB8C",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "",  # paste a browser user-agent here
}

def main():
    num = 0
    for item in open('re_tmall.txt'):
        skunum = 0
        time.sleep(0.5)  # be polite between item pages
        try:
            # re_tmall.txt lines look like kw|//detail.tmall.com/item.htm?id=...|price|title
            url = 'http:' + item.strip().split('|')[1].split('&')[0]
            r = requests.get(url, headers=headers, timeout=60)
            s = BeautifulSoup(r.content, "lxml")
        except Exception as e:
            print e
            continue  # without a page there is nothing to parse
        try:
            # each <li> in the sale-prop list is one colour/style choice
            urltag = s.find('ul', attrs={"class": "tm-clear J_TSaleProp tb-img "}).find_all('li')
            skunum = len(urltag)
        except Exception as e:
            print e
        sku_list = []
        try:
            # thumbnail strip: one <img> per preview picture
            yulan = s.find('ul', attrs={"id": "J_UlThumb"}).find_all('img')
            for img in yulan:
                sku_list.append(img.get('src'))
        except Exception as e:
            print e
        try:
            with open('re_tmall_skunum_imgurl.txt', 'a+') as my:
                my.write(item.strip() + '|' + str(skunum) + '|' + ",".join(sku_list) + '\n')
        except Exception as e:
            print e
        num += 1
        print num, skunum

if __name__ == '__main__':
    main()
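The thumbnails collected from J_UlThumb are usually small alicdn renditions such as //img.alicdn.com/...jpg_60x60q90.jpg. Below is a sketch for recovering the full-size picture, assuming the usual _WxHqNN.jpg suffix convention holds; full_size_url is a name introduced here, not part of the script above.

#coding:utf-8
import re

def full_size_url(src):
    # give protocol-relative urls (//img.alicdn.com/...) a scheme
    if src.startswith('//'):
        src = 'https:' + src
    # drop the thumbnail suffix, e.g. xxx.jpg_60x60q90.jpg -> xxx.jpg
    return re.sub(r'\.jpg_\d+x\d+q?\d*\.jpg$', '.jpg', src)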
Feeding keywords to the Tmall list page and scraping product data
#coding:utf-8
from bs4 import BeautifulSoup
import requests, sys, time
reload(sys)
sys.setdefaultencoding('utf-8')

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "cache-control": "max-age=0",
    "Cookie": "",  # paste a logged-in cookie here
    "referer": "https://www.tmall.com/?spm=a220o.1000855.0.0.5c90c348cacB8C",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "",  # paste a browser user-agent here
}

# Example search url:
# https://list.tmall.com/search_product.htm?&smAreaId=110100
#   &smToken=d5ab69fafe104a119d1b68cf5dee8e2b&smSign=tynCTnJ2%2BacN%2F4nGJImRRQ%3D%3D
# smToken and smSign are anti-bot parameters bound to the current browser session.

def main():
    for kw in open('kw_tmall.txt'):
        for page in range(5):
            time.sleep(1)  # be polite between pages
            offset = page * 60  # the `s` parameter is a result offset, 60 items per page
            url = 'https://list.tmall.com/search_product.htm'
            payload = {
                'q': kw.strip(),
                's': str(offset),
                'spm': 'a220m.1000858.a2227oh.d100',
                'sort': 'd',
                'smToken': 'f00c864b1da14df796fd2805498c65af',  # copy a fresh value from the browser
                'cat': '50036568',
                'style': 'g',
                'from': '.list.pc_1_searchbutton',
                'smAreaId': '110100',
                'smSign': '6eD%2BBFqGQ7CwR0gGpXh4HA%3D%3D',  # copy a fresh value from the browser
            }
            r = requests.get(url, params=payload, headers=headers, timeout=60)
            s = BeautifulSoup(r.content, "lxml")
            divtag = s.find_all('div', attrs={'class': 'product-iWrap'})
            for at in divtag:
                sku = at.find('a').get('href')  # product link
                # images are lazy-loaded, so the real url may sit in data-ks-lazyload instead of src
                if at.find('img').get('src'):
                    img = at.find('img').get('src')
                else:
                    img = at.find('img').get('data-ks-lazyload')
                price = at.find('em').get('title')  # price
                title = at.find('p', attrs={"class": "productTitle"}).find('a').get('title')
                print sku, img, price, title
                with open('re_tmall.txt', 'a+') as my:
                    my.write(kw.strip() + '|' + sku + '|' + price + '|' + title + '\n')

if __name__ == '__main__':
    main()
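find() returns None when a product-iWrap cell is an ad slot or misses a price or title, and the chained .get() calls above would then raise AttributeError and abort the page. Below is a defensive variant of the per-cell extraction, assuming the same selectors; parse_cell is a name introduced for this sketch.

#coding:utf-8

def parse_cell(at):
    # at is one product-iWrap Tag from the soup built above
    a = at.find('a')
    img_tag = at.find('img')
    em = at.find('em')
    p = at.find('p', attrs={"class": "productTitle"})
    if not (a and img_tag and em and p and p.find('a')):
        return None  # skip malformed cells instead of crashing the whole page
    img = img_tag.get('src') or img_tag.get('data-ks-lazyload')
    return (a.get('href'), img, em.get('title'), p.find('a').get('title'))

Inside the loop, rows that come back as None can simply be skipped.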
The JD (京东) version is just as easy; it is on GitHub, go find it yourself.