Scraping a POST-based web page with Python
Purpose
To retrieve information on the progeny of a specific rice variety used as the male or female parent.
Target page
http://www.ricedata.cn/variety/default_advanced.aspx
Overview: in the drop-down at the upper left you can choose the search field, e.g. male parent or female parent, and then enter the variety name.
Finding the request parameters
Right-click the page and choose Inspect (or open the browser developer tools), switch to the Network tab, submit a search, and click the request name; the request headers and the form data (request body) are shown there.
These values are the key parameters to fill in when building our own POST request.
Postman can also be used to inspect and replay the request; it is extremely handy.
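As a side note, the same captured form data can in principle be replayed with the requests library instead of urllib; the sketch below is only an illustration of that idea, not the script used later. The field names ddlfiled, varname and btnseek come from the captured form data, while the hidden ASP.NET fields (__VIEWSTATE, __EVENTVALIDATION, ...) must be copied from your own browser session.
import requests

url = 'http://www.ricedata.cn/variety/default_advanced.aspx'
session = requests.Session()
session.get(url)                      # pick up the site's cookies first
form = {
    '__VIEWSTATE': '...',             # copy from the captured form data
    '__VIEWSTATEGENERATOR': '...',    # copy from the captured form data
    '__EVENTVALIDATION': '...',       # copy from the captured form data
    'ddlfiled': 'father',             # search field: variety used as male parent
    'varname': '明恢63',               # variety name to search for
    'btnseek': '检索',                 # the "search" button
}
resp = session.post(url, data=form,
                    headers={'User-Agent': 'Mozilla/5.0', 'Referer': url})
resp.encoding = 'utf-8'               # the page is served as UTF-8
print(resp.text[:500])                # quick look at the start of the result page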
Code:
#coding=utf-8
import urllib.request
import urllib.parse
import chardet
### The next two lines were a quick test while tracking down an encoding problem ###
# s="\xe5\xb9\xb4\xe4\xbb\xbd"
# print(s.encode('gbk'))
### The next three lines use chardet to work out which encoding some problem bytes are in ###
print(chardet.detect(b'\xe4\xba\xa9\xe4\xba\xa7\xef\xbc\x88\xe5\x85\xac\xe6\x96\xa4\xef\xbc\x89\xe2\x89\xa5'))
t = b'\xe4\xba\xa9\xe4\xba\xa7\xef\xbc\x88\xe5\x85\xac\xe6\x96\xa4\xef\xbc\x89\xe2\x89\xa5:Index.php'
print(t.decode('utf-8'))
### The actual scraping starts here ###
csdnLoginUrl='http://www.ricedata.cn/variety/default_advanced.aspx'
# URL of the search page; the variable name is left over from the script this was copied from and can be ignored
# POST data: this is the request body (the form fields captured in the browser)
postdata = {
'__EVENTARGUMENT':'',
'__EVENTTARGET':'',
'__EVENTVALIDATION': '/wEWQAK9vYmxAgKjx5bgCgLg+efBCAK19e/PCAKM/pCOAQKzodKvBwKzod6vBwL55IjQCwKtwY7/AQLB36qRDgKqjoH/AgKqjqlwAsKk1SwCpY+B/wIChvvVLAKzp6pwAqulkF8Cr9+7sQwCrZO7sQwC0JG7sQwCmJ67sQwCxP+4sQwCjOK7sQwCuN24sQwCnMO4sQwCi+m7sQwCzo67sQwCwO64sQwCu4G7sQwCk6C7sQwCxey4sQwCtMW7sQwC7tK4sQwCtei4sQwC0567sQwC3JC7sQwCyte4sQwCga+7sQwCltC4sQwC+OG4sQwCtPO4sQwC2PG7sQwCx/+4sQwC8p+7sQwCysG4sQwCmJ67sQwC17C7sQwC9Zq7sQwCv6K7sQwCqp67sQwCpp27sQwC+/W7sQwChuO4sQwCwq63jAkCn/fD4g4Ch4v+9AQC2tnN/QwCutTN+AQCi5CHiwsC+4+b2w4C57Su9QoCnLr4/QoCnLq4yw8C2LmflQbPyP6D6W8FBuDBlaPWWKjtpB25yQ==',
'__VIEWSTATEGENERATOR':'060A9241',
'__VIEWSTATE':'/wEPDwUIMTYxNjYzOTcPZBYCAgEPZBYIZg8QDxYCHgtfIURhdGFCb3VuZGdkZGRkAgYPEA8WAh8AZ2RkZGQCBw8QDxYCHwBnZGRkZAIJDxUBiwIyMTI05p2hLCZuYnNwOzEvODXpobUmbmJzcDs8YSB0aXRsZT0n6L+U5Zue5LiL5LiA6aG1JyBzdHlsZT0nZm9udC1zaXplOjEzJyBocmVmPSdkZWZhdWx0X2FkdmFuY2VkLmFzcHg/cD0yJm9yZGVyYnk9R0JzdGFuZGFyZCZhc2M9QVNDJz7kuIvpobU8L2E+Jm5ic3A7PGEgdGl0bGU9J+i3s+iHs+acq+mhtScgc3R5bGU9J2ZvbnQtc2l6ZToxMycgaHJlZj0nZGVmYXVsdF9hZHZhbmNlZC5hc3B4P3A9ODUmb3JkZXJieT1HQnN0YW5kYXJkJmFzYz1BU0MnPuacq+mhtTwvYT5kGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYBBQdjaGtoZWF0inKiP5jhiBznXBVIj1xQqFTH5Fo=',
'ddlfiled':'father',    # search field: use the variety as the male parent
'varname':'明恢63',    # variety name (Minghui 63)
'cycle1':'',
'cycle2':'',
'txtyield':'',
'ddlseason':'',
'ddlarea':'',
'quyuinfo':'',
'quyuprov':'',
'quyucity':'',
'btnseek':'检索'    # the "search" button
}
# urlencode the POST data and print it to check that it looks right
postdata = urllib.parse.urlencode(postdata).encode('utf-8')
print(postdata)
# build the Request object
req = urllib.request.Request(
url = csdnLoginUrl,
data = postdata)
# add the request headers copied from the captured request
req.add_header('Accept', 'text/html, application/xhtml+xml, application/xml; q=0.9, */*; q=0.8')
req.add_header('Accept-Language', 'zh-CN')
req.add_header('Accept-Encoding', 'gzip, deflate')  # note: urllib does not decompress gzip automatically; drop this header if the response fails to decode
req.add_header('Cache-Control','max-age=0')
req.add_header('Connection', 'Keep-Alive')
# req.add_header('Content-Length','1842')
req.add_header('Content-Type','application/x-www-form-urlencoded')
# the Cookie below (like __VIEWSTATE and __EVENTVALIDATION above) is session-specific; copy it from your own browser
req.add_header('Cookie','Hm_lpvt_1b18c49e27c0d0ba21d123ac61f37650=1561359522; Hm_lvt_1b18c49e27c0d0ba21d123ac61f37650=1561346703,1561353815,1561354799,1561354895; ASP.NET_SessionId=llgxqa55viqel5vqjsfncbfz; acw_tc=781bad0a15604835210482664e61e4735ad94400e3d7ce3229bfa08500b411; bdshare_firstime=1560483632891')
req.add_header('Host','www.ricedata.cn')
req.add_header('Referer','http://www.ricedata.cn/variety/default_advanced.aspx')
req.add_header('Upgrade-Insecure-Requests','1')
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134')
# send the POST request and read the response
r = urllib.request.urlopen(req)
html = r.read()
# decode the response bytes so the Chinese text is not garbled
newhtml=html.decode('utf-8')
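# (Alternative sketch, not in the original script): instead of hard-coding 'utf-8',
# chardet, imported above, could guess the encoding of the raw bytes, e.g.:
#   guess = chardet.detect(html)          # returns a dict such as {'encoding': 'utf-8', ...}
#   newhtml = html.decode(guess['encoding'] or 'utf-8', errors='replace')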
# write the result page to a text file
newdata=open("C:\\Users\\ykl\\Documents\\博士期间文档\\水稻育种\\2.txt",'w+')
print(newhtml,file=newdata)
newdata.close()
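The script above only dumps the raw result page; the progeny entries still have to be extracted from the HTML. Below is a minimal sketch of one way to do that, assuming the results are rendered as ordinary HTML table rows (check the actual tag structure in the saved file first); the regular expressions here are illustrative and not taken from the original script.
import re

# Read the page saved by the script above
with open("C:\\Users\\ykl\\Documents\\博士期间文档\\水稻育种\\2.txt", encoding='utf-8') as f:
    page = f.read()

# Pull out each table row, strip the tags, and keep rows that mention the parent variety
for row in re.findall(r'<tr.*?</tr>', page, flags=re.S):
    text = re.sub(r'<[^>]+>', ' ', row)   # drop the HTML tags
    text = ' '.join(text.split())         # collapse runs of whitespace
    if '明恢63' in text:
        print(text)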
Acknowledgements
Many thanks to A-Zhao and Taotao for their help! Hahaha, so happy! Taotao, come to Wuhan soon so we can get together for a meal at A-Zhao's place!