Sometimes it is quite handy to run Baidu active push yourself (mainly because the dev team's queue is already full of SEO requests, so whatever you can handle on your own should not take up dev resources).
You only need to take care of the following points:
1、how to get only the URLs published yesterday;
2、each push is limited to 2,000 URLs (a minimal sketch of one push request follows this list);
3、if there are M (mobile) pages, push them separately.
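Before the full script, here is a minimal sketch of what a single push request looks like, assuming the 2,000-URL batching from point 2; push_batch, www.example.com and YOUR_TOKEN are placeholders of mine, the real token comes from your own Baidu webmaster account.

# minimal sketch: one active-push request carrying at most 2000 URLs
import requests

def push_batch(urls, site, token):
    # Baidu's push endpoint takes newline-separated URLs as plain text
    api = 'http://data.zz.baidu.com/urls'
    headers = {'Content-Type': 'text/plain'}
    params = {'site': site, 'token': token}
    return requests.post(api, params=params, headers=headers,
                         data='\n'.join(urls[:2000]))

# usage (hypothetical values):
# resp = push_batch(url_list, 'www.example.com', 'YOUR_TOKEN')
# print resp.content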
#coding:utf-8
import requests, time, re, os
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def main():
    # optionally clean up old yesterday files first
    # if os.path.exists('yesterday.txt'):
    #     os.remove('yesterday.txt')

    # Pull the URLs out of the sitemap XML, drop the ones that were already pushed,
    # write the new ones into per-batch files and append them to the master list.
    url = 'http://yp.jd.com/00/00_0.xml'
    r = requests.get(url)
    zhishi_url = re.findall(r'<loc>(.*?)</loc>', r.content)
    has_push_list = set(line.strip() for line in open('all_url.txt'))  # already-pushed URLs; the file must exist

    f = open('all_url.txt', 'a+')              # master list of all URLs
    f_ytd = open('yesterday_0.txt', 'w+')      # URLs published yesterday (PC)
    f_ytd_m = open('yesterday_m_0.txt', 'w+')  # URLs published yesterday (M)
    num = 0
    txt_index = 0
    for link in zhishi_url:
        if link in has_push_list:
            continue
        f.write(link + '\n')                            # append to the master list
        f_ytd.write(link + '\n')                        # not-yet-pushed URL, PC batch file
        f_ytd_m.write(link.replace('www', 'm') + '\n')  # same URL rewritten for the M site
        if num % 2000 == 1999:  # at most 2000 URLs per push, so rotate to a new batch file
            f_ytd.close()
            f_ytd_m.close()
            txt_index += 1
            f_ytd = open('yesterday_%s.txt' % txt_index, 'w+')
            f_ytd_m = open('yesterday_m_%s.txt' % txt_index, 'w+')
        num += 1
    f.close()
    f_ytd.close()
    f_ytd_m.close()
    print 'yesterday has %s' % num
    print 'crawl done'
    time.sleep(5)

    # start pushing, one batch file at a time
    print 'push begin'
    for i in range(0, txt_index + 1):
        try:
            headers = {'Content-Type': 'text/plain'}
            url = 'http://data.zz.baidu.com/urls'
            # PC site
            params = {'site': 'www.jd.com', 'token': '00'}  # ,'type':'original'
            r = requests.post(url, params=params, headers=headers,
                              data=open('yesterday_%s.txt' % i, 'rb').read())
            # M site
            params_m = {'site': 'm.jd.com', 'token': '00'}  # ,'type':'original'
            r_m = requests.post(url, params=params_m, headers=headers,
                                data=open('yesterday_m_%s.txt' % i, 'rb').read())
            print 'PC:' + r.content + ',' + 'M:' + r_m.content
        except Exception as e:
            print e
            continue
    print 'Finish!!!'

if __name__ == '__main__':
    while True:
        current_time = time.localtime(time.time())
        if current_time.tm_hour == 18 and current_time.tm_min == 0 and current_time.tm_sec == 0:
            main()
        time.sleep(1)  # avoid a busy loop and a double run within the same second
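On a successful push Baidu normally returns a short JSON body (with fields roughly like success and remain, the remaining daily quota), and an error message otherwise, so printing r.content as above is enough to see how each batch went. The while True loop at the end is just a crude scheduler; a cron job that starts the script at 18:00 every day would do the same job without keeping a process spinning.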