Scraping AJAX data with Python
The two examples below illustrate the approach.
Example 1: scraping a paginated product list
Approach: 1. analyze the URL; 2. construct the request URL; 3. fetch the data; 4. write it to a file; 5. to save time, do the fetching with multiple threads.
Question 1: will writing from multiple threads cause conflicts? Not in this example: the API response exposes the total item and page counts, each page is fetched in its own thread and appended to a shared list, and once all pages are collected a single separate thread writes the list to the file (a short thread-safety note follows the script).
import requests, json
from threading import Thread

# fetch one page of the AJAX listing and return the parsed JSON (None on error)
def get_page(offset):
    headers = {
        'Host': 'www.xx.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'Referer': 'http://www.xxx.cn/plist.html'
    }
    url = 'http://www.xxxx.cn/plist.html?page=' + str(offset)
    try:
        res = requests.get(url=url, headers=headers)
        return res.json()
    except Exception as e:
        print(e)
        return None
# assemble the data
# shared list that every page thread appends to
res_data = []

def write_file(data):
    if data is not None:
        global res_data
        for item in data['data']:
            dict_res = {}
            dict_res['name'] = item['g_name']
            dict_res['url'] = item['g_img']
            res_data.append(dict_res)

# write the assembled list to a file from a separate thread
def write_json():
    with open('data.txt', 'w', encoding='utf-8') as f:
        json.dump(res_data, f)
    print('write ok')

# worker: fetch one page and add its records to the shared list
def run(page):
    data = get_page(page)
    write_file(data)
if __name__ == '__main__':
    first_page = get_page(1)
    total_page = first_page['last_page']  # total number of pages
    thread_list = []
    # one fetch thread per page (pages are numbered from 1)
    for i in range(1, total_page + 1):
        thread = Thread(target=run, args=(i,))
        thread.start()
        thread_list.append(thread)
    # wait for every page thread to finish before writing
    for thread in thread_list:
        thread.join()
    # write the collected data from another thread
    thread2 = Thread(target=write_json)
    thread2.start()
    thread2.join()
    print('ok')
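A note on Question 1: in CPython a single list.append is effectively atomic, so the page threads above can share res_data safely, and the file is only written after every worker has been joined. If the assembly step ever becomes a read-modify-write (for example, de-duplicating records before appending), guard the shared state with a lock. A minimal sketch of that variant; the lock and the de-duplication set are additions for illustration, not part of the original script:

from threading import Lock

res_data = []
res_lock = Lock()
seen_urls = set()

def write_file(data):
    if data is None:
        return
    for item in data['data']:
        record = {'name': item['g_name'], 'url': item['g_img']}
        # check-then-append is a read-modify-write, so take the lock around it
        with res_lock:
            if record['url'] not in seen_urls:
                seen_urls.add(record['url'])
                res_data.append(record)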
Example 2: scraping street-style ('街拍') photos from Toutiao
import requests
from urllib.parse import urlencode
from hashlib import md5
import os
from multiprocessing.pool import Pool  # use a process pool to download quickly

# fetch one page of search results from the AJAX endpoint
def get_page(offset):
    params = {
        "aid": "24",
        "app_name": "web_search",
        "offset": offset,
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": 20,
        "en_qc": 1,
        "cur_tab": 1,
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1589868205095",
        "_signature": "Jd--SAAgEBCicjqlJx1oXSXe.1AAHskpOqtHpIydeeXvI8cKTz2Viav8TZqOW9OkhxJBbNwd0QlzXThsKYBWvhtWjp.kb098c2ERJVTL4rZ83Gm.t-HibQGmRwFDHPMX1Ag",
    }
    headers = {
        "Host": "www.xxxx.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
        "Accept": "application/json, text/javascript",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Accept-Encoding": "gzip, deflate, br",
        "X-Requested-With": "XMLHttpRequest",
        "Content-Type": "application/x-www-form-urlencoded",
        "Connection": "keep-alive",
        "Referer": "https://www.xxxx.com/search/?keyword=%E8%A1%97%E6%8B%8D",
        "Cookie": "passport_auth_status=4e2c956d9c382caa634e918d5172b8ff%2C0f4b06ebf9e3e318c84a0eecc9442dd5; sso_auth_status=ff08b117e07ebf4300d482cf5cc30d45; sso_uid_tt=6fea73d4793f1b79f59f5a617feaff22; sso_uid_tt_ss=6fea73d4793f1b79f59f5a617feaff22; toutiao_sso_user=39705416627edcab9c45e2b65d2d0c9f; toutiao_sso_user_ss=39705416627edcab9c45e2b65d2d0c9f; sid_guard=b85bcaa805f8516561135b4e14496c92%7C1589791339%7C5184000%7CFri%2C+17-Jul-2020+08%3A42%3A19+GMT; uid_tt=91c92c7b35228506a3625e502b21cb72; uid_tt_ss=91c92c7b35228506a3625e502b21cb72; sid_tt=b85bcaa805f8516561135b4e14496c92; sessionid=b85bcaa805f8516561135b4e14496c92; sessionid_ss=b85bcaa805f8516561135b4e14496c92; tt_webid=6828431109119641095; s_v_web_id=verify_kadidh7u_sioNDvyI_0Rzv_4LHk_8f9u_vaPraRxYLEFs; ttcid=70273172efd6499da7b5d97c6007317527; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __tasessionId=nbz73rlzz1589868028446; SLARDAR_WEB_ID=5bd84172-c5b7-4465-a2a2-647092a758fc; tt_webid=6828431109119641095; csrftoken=bbb8a94b2d289864849a2349ab12ea6d; tt_scid=vktEovYN4sRQ3ObjgWSfbhNk0rK5qwxPELQgXo-SXRE-M9RNyOb-OTz1Dr8Ju2emb318",
        "TE": "Trailers"
    }
    url = "https://www.xxxxx.com/api/search/content?" + urlencode(params)
    try:
        response = requests.get(url=url, headers=headers)
        return response.json()
    except Exception as e:
        print(e)
        return None
# yield the image links contained in the JSON response
def get_img(json_data):
    if json_data and json_data.get('data'):
        for item in json_data.get('data'):
            title = item.get('title')
            image_list = item.get('image_list') or []  # some items carry no images
            print(title)
            for image in image_list:
                yield {
                    'title': title,
                    'img_url': image.get('url')
                }
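get_img is a generator, so each image record can be consumed and saved as soon as it is produced. A quick usage check with a hand-made dictionary that mimics the shape of the endpoint's JSON (the sample values are made up):

sample = {'data': [{'title': 'demo', 'image_list': [{'url': 'http://p.example.com/1.jpg'}]}]}
for entry in get_img(sample):
    print(entry)  # {'title': 'demo', 'img_url': 'http://p.example.com/1.jpg'}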
# save one image to disk, using the post title as the folder name
def save_img(dict_img):
    # create the folder first if it does not exist
    if not os.path.exists(dict_img.get('title')):
        os.mkdir(dict_img.get('title'))
    try:
        res = requests.get(dict_img.get('img_url'))
        if res.status_code == 200:
            # name the file after the MD5 of its content so re-runs skip duplicates
            file_path = '{0}/{1}.{2}'.format(dict_img.get('title'), md5(res.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(res.content)
            else:
                print('already downloaded', file_path)
    except Exception as e:
        print(e)

# worker: fetch one results page and save every image it references
def main(offset):
    json_data = get_page(offset)
    for item in get_img(json_data):
        save_img(item)
group_start = 1
group_end = 20

if __name__ == '__main__':
    pool = Pool()
    # offsets 20, 40, ..., 400 (each results page returns 20 items)
    groups = [x * 20 for x in range(group_start, group_end + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()
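Downloading images is I/O-bound rather than CPU-bound, so a thread pool works just as well as a process pool here and avoids the pickling and __main__ constraints of multiprocessing. A minimal alternative sketch of the same driver using concurrent.futures (an option, not what the script above uses; max_workers is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    offsets = [x * 20 for x in range(group_start, group_end + 1)]
    with ThreadPoolExecutor(max_workers=8) as executor:
        executor.map(main, offsets)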