Python爬虫实战,requests模块,Python实现爬取自主品牌汽车下
上一篇的推文讲了三家自主品牌车企,上汽、长安、吉利。接下来讲讲剩下的七家,东风、北汽、长城、一汽、奇瑞、广汽、江淮。
Python腾讯大牛直播预约:
4. 东风
从下面东风的车可以看出,大部分都是商用车,了解一下东风小康。其实现在的东风启辰,也是东风日产旗下的品牌,一款假国产车(网评)。
# Download the vehicle pictures shown on the Dongfeng (DFAC) home page,
# saving each category into its own folder under F:/Car/DFAC/.
import os
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
url = 'http://www.dfac.com/'
response = requests.get(url=url, headers=headers)
html = etree.HTML(response.text)

# The list index is the number used in the page's "Procon pc<i>" div class.
# Indexes 0 and 5 carry an empty name and are skipped.
Car_Type = ['', 'Light_truck', 'Engineering_truck', 'Light_guest', 'Coach', '', 'School_bus', 'Pick-up_Truck']
for i, car_type in enumerate(Car_Type):
    if not car_type:
        continue
    folder_path = "F:/Car/DFAC/" + car_type + "/"
    # exist_ok lets the script be re-run without crashing on existing folders.
    os.makedirs(folder_path, exist_ok=True)
    result = html.xpath('//div[@class="Procon pc' + str(i) + '"]//img/@src')
    for src in result:
        url = 'http://www.dfac.com' + src
        r = requests.get(url)
        # Use the last path segment as the file name so that any upload
        # sub-folder (not only 2018/09 and 2018/10) yields a valid name.
        picture_name = url.rsplit('/', 1)[-1]
        with open(os.path.join(folder_path, picture_name), 'wb') as f:
            f.write(r.content)
        print(url)
{ 左右滑动切换图片 }
5. 北汽
北汽底下的自主品牌有北汽绅宝,北京越野。当然还有北汽福田,以及福田旗下的宝沃汽车,那个来自欧洲,与BBA齐名的豪华品牌,不过现在日子也是难过。
# Download the model pictures listed by the BAIC Motor car-series JSON
# endpoint, one folder per category under F:/Car/BAIC_Motor/.
import os
import json
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
url = 'http://www.baicmotorsales.com/jeesite/web/car/queryCarSeries'
response = requests.get(url=url, headers=headers)
data = json.loads(response.text)

# Car_Type[k] is the folder name; Car[k] + 'list' is the key read from
# data['body'] for that category.
Car_Type = ['SUV', 'Car', 'ORV']
Car = ['s', 'j', 'y']
for car_type, prefix in zip(Car_Type, Car):
    folder_path = "F:/Car/BAIC_Motor/" + car_type + "/"
    # exist_ok lets the script be re-run without crashing on existing folders.
    os.makedirs(folder_path, exist_ok=True)
    # Named list_key rather than str so the builtin is not shadowed.
    list_key = prefix + 'list'
    for item in data['body'][list_key]:
        pic_path = item.get('modelPicPc')
        if not pic_path:
            # Entry without a picture path: nothing to download.
            continue
        url = 'http://www.baicmotorsales.com' + pic_path
        r = requests.get(url)
        # Last path segment; independent of the upload date sub-folder.
        picture_name = url.rsplit('/', 1)[-1]
        with open(os.path.join(folder_path, picture_name), 'wb') as f:
            f.write(r.content)
        print(url)
{ 左右滑动切换图片 }
6. 长城
长城,SUV引领者。一款SUV哈弗H6,已经累计65个月SUV销量冠军,可以与秋名山车神一战啦。如今的WEY品牌也是很给力,带着自主品牌又迈上了一个台阶。
长城
# Download the Great Wall model pictures referenced by the site's
# carModel.js data file into F:/Car/Great_Wall_Automobile/Great_Wall/.
import os
import re
import requests
import demjson

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
url = 'http://www.gwm.com.cn/statics/gwm-cn/js/carModel.js'
response = requests.get(url=url, headers=headers)

# Drop a fixed 45-character prefix and 3-character suffix from the script.
# NOTE(review): presumably the JS wrapper around the data literal; these
# offsets break if the file layout changes.
res = response.text[45:-3]
message = res.replace('\r', '').replace('\n', '').replace('\t', '')
# Keep everything before the first ';_CIACarTypeAttributeSet' marker.
result = re.findall('(.*?);_CIACarTypeAttributeSet', message)
# demjson is used to decode the captured literal.
# NOTE(review): presumably because it is a JS object, not strict JSON.
data = demjson.decode(result[0])

folder_path = "F:/Car/Great_Wall_Automobile/Great_Wall/"
# exist_ok lets the script be re-run without crashing on an existing folder.
os.makedirs(folder_path, exist_ok=True)
for item in data:
    try:
        url = 'http://www.gwm.com.cn' + item['Pics']['F']
    except (KeyError, TypeError):
        # Entry has no usable front picture; skip it. Only lookup errors are
        # caught so that Ctrl-C and real bugs are no longer swallowed.
        continue
    r = requests.get(url)
    # File name is the last 20 characters of the URL, as in the original.
    picture_name = url[-20:]
    with open(os.path.join(folder_path, picture_name), 'wb') as f:
        f.write(r.content)
    print(url)
{ 左右滑动切换图片 }
哈弗
# Download the first pictures of the Haval photo gallery page into
# F:/Car/Great_Wall_Automobile/Haval/.
import os
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
url = 'http://www.haval.com.cn/photo/index.shtml'
response = requests.get(url=url, headers=headers)
html = etree.HTML(response.text)
result = html.xpath('//ul[@class="photoCont-list clearfix"]//img/@src')

folder_path = "F:/Car/Great_Wall_Automobile/Haval/"
# exist_ok lets the script be re-run without crashing on an existing folder.
os.makedirs(folder_path, exist_ok=True)

# Only the first 13 images are wanted. Slicing instead of range(13) avoids
# an IndexError when the page returns fewer.
for src in result[:13]:
    # Protocol-relative sources ("//host/path") need an explicit scheme.
    # The original added it to the first image only.
    url = 'http:' + src if src.startswith('//') else src
    r = requests.get(url)
    # File name is the last 13 characters of the URL, as in the original.
    picture_name = url[-13:]
    with open(os.path.join(folder_path, picture_name), 'wb') as f:
        f.write(r.content)
    print(url)
{ 左右滑动切换图片 }
WEY
# Download the new-car pictures shown on the WEY home page into
# F:/Car/Great_Wall_Automobile/Wey/.
import os
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
url = 'https://www.wey.com/'
response = requests.get(url=url, headers=headers)
html = etree.HTML(response.text)
result = html.xpath('//div[@class="new-car-box col-xs-12 col-sm-6 col-lg-3"]//img/@src')

folder_path = "F:/Car/Great_Wall_Automobile/Wey/"
# exist_ok lets the script be re-run without crashing on an existing folder.
os.makedirs(folder_path, exist_ok=True)
for src in result:
    url = 'https://www.wey.com/' + src
    r = requests.get(url)
    # Strip the common image directory so only the file name remains.
    picture_name = url.replace('https://www.wey.com/home/img/home/', '')
    # Write into the folder created above instead of a second hard-coded path.
    with open(os.path.join(folder_path, picture_name), 'wb') as f:
        f.write(r.content)
    print(url)
{ 左右滑动切换图片 }
7. 一汽
就在上个月,一汽与16家银行签署了战略合作协议,协议中各银行给一汽意向性授信共计10150亿元。乖乖的,这个老大哥胃口真是大,怎么有点像当初振兴东北的政策一般,会不会又打shuipiao~
红旗
# Download the model pictures from the Hongqi home page into
# F:/Car/FAW/Hongqi/.
import os
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
url = 'http://www.faw-hongqi.com.cn/'
response = requests.get(url=url, headers=headers)
html = etree.HTML(response.text)
result = html.xpath('//div[@class="logo_t logo_t1"]//img/@src')

folder_path = "F:/Car/FAW/Hongqi/"
# exist_ok lets the script be re-run without crashing on an existing folder.
os.makedirs(folder_path, exist_ok=True)
for src in result:
    url = 'http://www.faw-hongqi.com.cn/' + src
    r = requests.get(url)
    # Strip the image directory and the "top_" prefix to get the file name.
    # The double slash in the pattern matches the URL built above.
    picture_name = url.replace('http://www.faw-hongqi.com.cn//pcs/images/top_', '')
    # Write into the folder created above instead of a second hard-coded path.
    with open(os.path.join(folder_path, picture_name), 'wb') as f:
        f.write(r.content)
    print(url)
{ 左右滑动切换图片 }
奔腾
# Download the model pictures from the FAW Bt product page and sort them
# into Car and SUV folders under F:/Car/FAW/Bt/.
import os
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
url = 'http://www.fawcar.com.cn/products/public/car_bt.html'
response = requests.get(url=url, headers=headers)
html = etree.HTML(response.text)
result = html.xpath('//dl[@class="imglist"]//img/@src')

base_path = "F:/Car/FAW/Bt/"
Car_Type = ['Car', 'SUV']
for car_type in Car_Type:
    folder_path = base_path + car_type + "/"
    # exist_ok lets the script be re-run without crashing on existing folders.
    os.makedirs(folder_path, exist_ok=True)

# Positions in the image list that are saved as 'Car'; the rest go to 'SUV'.
car_positions = {0, 3, 4, 5, 6, 10}

# Only the first 11 images are handled. Slicing instead of range(11) avoids
# an IndexError when the page returns fewer.
for position, src in enumerate(result[:11]):
    url = 'http://www.fawcar.com.cn/products/' + src
    r = requests.get(url)
    picture_name = url.replace('http://www.fawcar.com.cn/products/images/', '')
    car_type = 'Car' if position in car_positions else 'SUV'
    with open(os.path.join(base_path, car_type, picture_name), 'wb') as f:
        f.write(r.content)
    print(url)
{ 左右滑动切换图片 }
8. 奇瑞
奇瑞QQ,一款风靡全国的乘用车。曾经乘用车的一哥,现如今先是将车设计的还不错但是就是卖不出去的观致拱手给了宝能。不过给了宝能之后,销量上去了,大写的尬(就算是内销也是一种本事)。最后再到自己也不得不面临出售的局面。
# Download the model pictures from the Chery home page and sort them into
# SUV and Car folders under F:/Car/Chery/.
import re
import os
import urllib
import urllib.parse  # explicit: `import urllib` alone does not guarantee the submodule is loaded
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
url = 'http://www.chery.cn/'
response = requests.get(url=url, headers=headers)
html = etree.HTML(response.text)
result = html.xpath('//div[@class="wrapper"]//img/@src')

base_path = "F:/Car/Chery/"
Car_Type = ['Car', 'SUV']
for car_type in Car_Type:
    folder_path = base_path + car_type + "/"
    # exist_ok lets the script be re-run without crashing on existing folders.
    os.makedirs(folder_path, exist_ok=True)

# Captures the file stem between "<digit>/" and ".png". Raw string with an
# escaped dot so that only a literal ".png" ends the match.
name_pattern = re.compile(r'\d/(.*?)\.png')

for position, src in enumerate(result):
    url_handle = 'http://www.chery.cn' + src
    match = name_pattern.search(url_handle)
    if match is None:
        # Not a "<digit>/<name>.png" picture; skip instead of raising.
        continue
    cn_name = match.group(1)
    # Percent-encode the captured stem. The original variable name suggests
    # it is expected to contain Chinese characters.
    name_second = urllib.parse.quote(cn_name)
    name_first = src.replace(cn_name, '').replace('.png', '')
    url = 'http://www.chery.cn' + name_first + name_second + '.png'
    r = requests.get(url)
    picture_name = url.replace('http://www.chery.cn/media/', '').replace('/', '')
    # The first six images are saved as SUVs, the remainder as cars.
    car_type = 'SUV' if position < 6 else 'Car'
    with open(os.path.join(base_path, car_type, picture_name), 'wb') as f:
        f.write(r.content)
    print(url)
{ 左右滑动切换图片 }
9. 广汽
广汽发展的比较晚,但是现在势头很好,尤其是广汽传祺。业界都说离不开广东人的务实,思想开放。确实,看看广东各地市常年的GDP就知道了,遍地开花。
# Download the model pictures from the GAC Motor car / suv / mpv pages,
# one folder per category under F:/Car/Gac_Motor/.
import os
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
for category in ['car', 'suv', 'mpv']:
    folder_path = "F:/Car/Gac_Motor/" + category + "/"
    # exist_ok lets the script be re-run without crashing on existing folders.
    os.makedirs(folder_path, exist_ok=True)
    url = 'https://www.gacmotor.com/Home/Models/' + category + '.html'
    # NOTE(review): verify=False disables TLS certificate checking. It is
    # kept from the original; remove it if the site's certificate validates.
    response = requests.get(url=url, headers=headers, verify=False)
    html = etree.HTML(response.text)
    result = html.xpath('//div[@class="moNav"]//img/@src')
    # The same image can appear more than once; download each source once.
    for src in set(result):
        url = 'https://www.gacmotor.com' + src
        r = requests.get(url, headers=headers, verify=False)
        picture_name = url.replace('https://www.gacmotor.com/Public/Home/img/', '')
        # Write into the folder created above instead of a second hard-coded path.
        with open(os.path.join(folder_path, picture_name), 'wb') as f:
            f.write(r.content)
        print(url)
{ 左右滑动切换图片 }
10. 江淮
江淮,较上面的这些品牌,之前江淮应该是唯一一家没有合资品牌的国有车企。但是现在也与大众合资做起了新能源汽车,确实市场就摆在那里,不多学习,何来市场。
# Download the model pictures from the JAC product-centre page and sort
# them into category folders under F:/Car/JAC/.
import os
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
url = 'http://www.jac.com.cn/jacweb/procenter/'
response = requests.get(url=url, headers=headers)
html = etree.HTML(response.text)

base_path = "F:/Car/JAC/"
Car_Type = ['Refine', 'MPV', 'Light_truck', 'Heavy_truck']
for car_type in Car_Type:
    folder_path = base_path + car_type + "/"
    # exist_ok lets the script be re-run without crashing on existing folders.
    os.makedirs(folder_path, exist_ok=True)

# Maps the numeric suffix of each "floor<N>" section id on the page to the
# folder its pictures are saved in. One table replaces four copy-pasted
# branches that differed only in the target folder.
floor_to_type = {
    '98': 'Refine',
    '74': 'Refine',
    '144': 'MPV',
    '75': 'Light_truck',
    '76': 'Light_truck',
    '77': 'Light_truck',
    '78': 'Heavy_truck',
    '985': 'Heavy_truck',
}
ID = ['98', '74', '144', '75', '76', '77', '78', '985']
for floor_id in ID:
    car_type = floor_to_type[floor_id]
    result = html.xpath('//div[contains(@class, "brandWordList") and @id="floor' + floor_id + '"]//div[@class="brandWordLeft"]//img/@src')
    for src in result:
        url = 'http://www.jac.com.cn' + src
        r = requests.get(url)
        # Flatten the remaining path into a single file name.
        picture_name = url.replace('http://www.jac.com.cn/u/cms/www/', '').replace('/', '')
        with open(os.path.join(base_path, car_type, picture_name), 'wb') as f:
            f.write(r.content)
        print(url)
{ 左右滑动切换图片 }