我的dcd爬虫-Python

admin2024-05-15  2

我自己写了一个懂车帝(dcd,dongchedi.com)的爬虫,这个网站比较简单。看了看别人的程序,觉得用起来挺别扭,就自己捣鼓了一天,弄出来了。

这个网站没有反爬,有一些是动态网页,有一些是静态。

首先,获取销量排行榜前300的车型。

import os
import json
import requests
from parsel import Selector


# ---------------------------------------------------------#
# ----           * 获得车辆销售排行榜前300、100的车         *        ----#
# ---------------------------------------------------------#


# Endpoint queried for the sales ranking; its response is decoded as JSON.
url = "https://www.dongchedi.com/motor/pc/car/rank_data"

# Headers sent with every ranking request: only a browser user-agent is set.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
}


def get_param(page):
    """Return the query parameters for one request to the ranking endpoint.

    ``page`` is stored unchanged under the ``offset`` key; every other key
    carries a fixed value (10 entries per request, city "烟台", ranking type
    "11", all remaining filters left empty).
    """
    return {
        "aid": "1839",
        "app_name": "auto_web_pc",
        "city_name": "烟台",
        "count": "10",
        "offset": page,
        "month": "",
        "new_energy_type": "",
        "rank_data_type": "11",
        "brand_id": "",
        "price": "",
        "manufacturer": "",
        "outter_detail_type": "",
        "nation": "0",
    }


def get_response(pageNum, timeout=10):
    """Fetch one page of the sales ranking and return the HTTP response.

    Args:
        pageNum: Zero-based page index. It is multiplied by 10 and sent as
            the ``offset`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` object; callers decode it with ``.json()``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    params = get_param(str(pageNum * 10))

    # NOTE(review): verify=False turns off TLS certificate verification. It is
    # kept from the original; drop it unless a local proxy really needs it.
    # The body is read eagerly (no stream=True), so ``resp`` remains usable
    # after the ``with`` block has released the connection.
    with requests.get(
        url=url,
        headers=headers,
        params=params,
        verify=False,
        timeout=timeout,
    ) as resp:
        resp.raise_for_status()
        print(resp.status_code)
    return resp


# Download the ranking page by page: 30 requests, one per page index.
data_list = []  # one decoded JSON payload per page
for i in range(30):
    print(f"销量前{i * 10} 的车")
    response = get_response(i)
    data_list.append(response.json())

获取之后,就能访问该车型。一般一个车型有好多款式,我的目的是想比较一些车型的尺寸,所以每个车型只选第一种款式,进入该款式的参数配置页面,把参数下载下来放到一个文件里,这样就可以比较现在在售车型的尺寸情况。

第二部分,我尝试了一下动态请求车型的价格。不过这一部分后面数据分析没有用到。


# Flatten the downloaded ranking pages into one table holding, per series,
# its name, its series id and the id of the first model it lists.
import jsonpath
import pandas as pd

# Notebook-style inspection of what was downloaded.
len(data_list)
data_list[0]['data']['list'][0]['series_name']

# "$.." searches every nesting level, so each call returns one flat list
# covering all pages.
name_list = jsonpath.jsonpath(data_list, "$..series_name")
id_list = jsonpath.jsonpath(data_list, "$..series_id")
id_list
first_list = jsonpath.jsonpath(data_list, "$..online_car_ids")

first_list[0][0]

# Keep the first model id of every series; None marks a series whose
# ``online_car_ids`` list is empty. ``first_id`` intentionally stays bound
# after the loop because later statements reuse it.
car_id_list = []
for ls in first_list:
    if ls:
        first_id = ls[0]
    else:
        first_id = None
    car_id_list.append(first_id)
len(car_id_list)

df = pd.DataFrame({
    "name": name_list,
    "series": id_list,
    "first_id": car_id_list,
})

df

# Rows without a model id. ``== None`` is an element-wise comparison in
# pandas and never matches missing values, so isna() is required here.
df[df['first_id'].isna()]

# NOTE(review): if the ids are integers, the None placeholders presumably
# make pandas store the column as float (ids written like 123.0) — confirm
# and cast after dropna() if that matters downstream.
df2 = df.dropna()
df.shape
df2.shape

# Round-trip through CSV so that the saved index comes back as an ordinary
# column, which is then named "rank".
df2.to_csv("Pythn-Anlys-138/dcd/top300cars.csv")
df = pd.read_csv("Pythn-Anlys-138/dcd/top300cars.csv")
df.keys()
df.columns
df.columns = ['rank', 'name', 'series', 'first_id']
# index=False: the old index already lives in "rank"; writing the new index
# as well would add a redundant unnamed column to the file.
df.to_csv("Pythn-Anlys-138/dcd/top300cars.csv", index=False)

# ---------------------------------------------------------#
# ----           * 价格         *        ----#
# ---------------------------------------------------------#

first_id  # notebook-style inspection of the id left over from the loop above

def get_price(car_id, timeout=10):
    """Download the dealer-price data of one model and save it as JSON.

    The decoded payload of the ``car_dealer_price`` endpoint is written
    unchanged to ``Pythn-Anlys-138/dcd/<car_id>.json``.

    Args:
        car_id: Model id, given as ``str`` or ``int``.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    wk_dir = "Pythn-Anlys-138/dcd"
    # An f-string accepts int ids too; ``car_id + ".json"`` raised TypeError.
    fname = f"{car_id}.json"
    url = "https://www.dongchedi.com/motor/pc/car/series/car_dealer_price"
    params = {
        "aid": "1839",
        "app_name": "auto_web_pc",
        "car_ids": car_id,
        "city_name": "烟台",
    }

    # NOTE(review): the published snippet elided this function's own headers
    # dict, leaving a placeholder that is not valid Python. The module-level
    # ``headers`` (user-agent only) is used instead — confirm the endpoint
    # needs nothing more.
    # NOTE(review): verify=False turns off TLS certificate verification; it
    # is kept from the original.
    with requests.get(
        url=url,
        headers=headers,
        params=params,
        verify=False,
        timeout=timeout,
    ) as resp:
        resp.raise_for_status()
        rj = resp.json()

    # Create the output directory on first use so the write cannot fail on a
    # missing folder.
    os.makedirs(wk_dir, exist_ok=True)
    with open(os.path.join(wk_dir, fname), 'w', encoding="utf-8") as f:
        json.dump(rj, f, ensure_ascii=False)
    print(f"保存文件成功 {car_id} !!!")

# Normalise the id to a string before it is used to build a file name.
first_id = str(first_id)

get_price(first_id)

这一部分呢后期没什么用,代码也很乱。

第三部分,获取某一车型的第一种款式的参数。


# ---------------------------------------------------------#
# ----           * 参数配置        *        ----#
# ---------------------------------------------------------#

from parsel import Selector

def get_detail_page(id, timeout=10):
    """Download the parameter page of one model and return its HTML.

    Args:
        id: Model id (``str`` or ``int``) appended to the ``params-carIds-``
            URL. The name shadows the builtin but is kept so existing
            callers keep working.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    # An f-string accepts int ids too; string + int raised TypeError.
    url = f"https://www.dongchedi.com/auto/params-carIds-{id}"

    # NOTE(review): the published snippet elided this function's own headers
    # dict, leaving a placeholder that is not valid Python. The module-level
    # ``headers`` (user-agent only) is used instead — confirm the page needs
    # nothing more.
    # NOTE(review): verify=False turns off TLS certificate verification; it
    # is kept from the original.
    with requests.get(
        url=url, headers=headers, verify=False, timeout=timeout
    ) as resp:
        resp.raise_for_status()
    # The body was read eagerly, so it is still available after the block.
    return resp.text

# Try the scrape on a single model first. No variable named ``id`` exists at
# this point, so ``get_detail_page(id)`` passed the builtin ``id`` function;
# the model id held in ``first_id`` is what has to be requested.
html = get_detail_page(first_id)

html

selector = Selector(html)

# Rows are selected by the presence of a ``data-row-anchor`` attribute.
all_rows = selector.css('div[data-row-anchor]')
all_rows
len(all_rows)

# One single-key dict per row: {label text: value text}.
dct_list = []
for row in all_rows:
    dct_item = {}
    label = row.css('div:nth-child(1) label::text').get()
    value = row.css('div:nth-child(2) div::text').get()
    dct_item[label] = value
    dct_list.append(dct_item)

dct_list

first_row = all_rows[0]

def parse_detail(id):
    """Scrape the parameter page of model ``id`` into a plain dict.

    Returns ``{"id": id, "detail": [...]}`` where ``detail`` holds one
    single-key dict per ``div[data-row-anchor]`` row, mapping the row's
    label text to its value text (either may be None when the selector
    finds nothing).
    """
    page = Selector(get_detail_page(id))
    entries = [
        {
            row.css('div:nth-child(1) label::text').get():
                row.css('div:nth-child(2) div::text').get()
        }
        for row in page.css('div[data-row-anchor]')
    ]
    return {"id": id, "detail": entries}

# No variable named ``id`` exists here, so ``parse_detail(id)`` passed the
# builtin ``id`` function; the model id held in ``first_id`` is intended.
dct_detail = parse_detail(first_id)
dct_detail


# NOTE(review): ``first_id_list`` is never assigned in the visible source, so
# the crawl loop below would stop with NameError. It is rebuilt here from the
# "first_id" column of the saved table; ids are turned into strings because
# save_detail joins the id into a file name. int() assumes the ids are
# numeric and removes a float suffix such as ".0" — confirm against the data.
first_id_list = [str(int(car_id)) for car_id in df["first_id"]]
first_id_list

def save_detail(id, dct_detail):
    """Write one model's parsed parameters to a JSON file.

    The file is ``Pythn-Anlys-138/dcd/<id>_dcd_detail.json``; non-ASCII text
    is stored as-is (UTF-8) rather than as ``\\u`` escapes.

    Args:
        id: Model id, given as ``str`` or ``int``; only used for the file
            name and the progress message.
        dct_detail: JSON-serialisable object to store.
    """
    out_dir = "Pythn-Anlys-138/dcd"
    # Create the output directory on first use so the write cannot fail on a
    # missing folder.
    os.makedirs(out_dir, exist_ok=True)
    # An f-string accepts int ids too; ``id + "..."`` raised TypeError.
    fname = f"{id}_dcd_detail.json"
    with open(os.path.join(out_dir, fname), 'w', encoding='utf8') as f:
        json.dump(dct_detail, f, ensure_ascii=False)
    print(f"Detail file {id}  saved!!!")

# Scrape and save the parameter page of every id in first_id_list.
for fid in first_id_list:
    dct_detail = parse_detail(fid)
    save_detail(fid, dct_detail)


最后,下载了一些json文件,后期做了一些数据整理,做成的数据表是这样的。

我的dcd爬虫-Python,第1张

 结果还不错。

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明原文出处。如若内容造成侵权/违法违规/事实不符,请联系SD编程学习网:675289112@qq.com进行投诉反馈,一经查实,立即删除!