Scraping POST Pages with Python (2)
The previous post already obtained the HTML file; now we need to adapt the script to our input format and organize the resulting data.
Format of the parent (father) / offspring results page
Modifications
Changed the data input: variety names are now read from a file and passed line by line to the finddata function, which scrapes the corresponding information for each one.
Changed the data cleanup: the rough position is located by finding the tr elements, and for this particular page the wanted data starts at the 8th tr. Within each tr we then look for td elements and href attributes (adapt this to your page; you may also need th and the like) to pull out both the data and the hyperlinks. A minimal sketch of this step follows.
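Here is a minimal sketch of that tr/td/href extraction in isolation. The HTML snippet is invented for illustration; on the real page you would additionally skip the first seven tr elements, as in the full code further down.

from bs4 import BeautifulSoup

# invented stand-in for one result row of the real search page
snippet = "<table><tr><td><a href='varis/606473.htm'>福两优63</a></td><td>早稻</td></tr></table>"
soup = BeautifulSoup(snippet, 'lxml')
for tr in soup.find_all('tr'):       # locate the rows
    tds = tr.find_all('td')          # cells within the row
    link = None
    for cell in tds:
        for a in cell.select('a'):   # hyperlinks within a cell
            link = a.get('href')
    if tds:
        print([[cell.string for cell in tds], link])
# prints: [['福两优63', '早稻'], 'varis/606473.htm']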
Resulting data format
Example:
珍桂矮1号
father
湘早籼17号
father
明恢63
father
[['福两优63', '早稻', '福建省农科院稻麦所', None, None, '闽审稻2000008', '120.0', None, None, None, None, None], 'varis/606473.htm']
[['两优2163', '晚稻', '福建省农业科学院', None, None, '闽审稻2000006', '130.0', None, None, None, None, None], 'varis/602856.htm']
[['泉珍10号', '早稻', '福建省泉州市农业科学研究所', None, None, '闽审稻2004002', '118.0', '458.4', '100.0', None, None, None], 'varis/601345.htm']
[['皖稻109', '晚稻', '宣城市种子公司', None, None, '皖品审03010387', '126.0', '439.6', '95.0', None, None, None], 'varis/605189.htm']
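Each record line is thus a Python list literal of the form [[fields...], link], preceded by the plain name and father/mother lines. If you later want to load these records back into Python, one option (a sketch; the file name and the utf-8 encoding are assumptions, so match whatever your run actually wrote) is to parse the list lines with ast.literal_eval:

import ast

records = []
# file name and encoding are assumptions; match the output of the script below
with open('allpinzhong2father.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line.startswith('['):          # skip the plain name/father lines
            fields, link = ast.literal_eval(line)
            records.append((fields, link))

print(records[0][0][0], records[0][1])    # e.g. 福两优63 varis/606473.htm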
Code
#coding=utf-8
import urllib.request
from urllib.request import quote
import urllib.parse
import requests,re
import chardet
from bs4 import BeautifulSoup
def finddata(name, newfumuben):
    # POST one variety name to the advanced-search form and write the
    # parsed result rows to the global output file `newdata` (opened below).
    csdnLoginUrl = 'http://www.ricedata.cn/variety/default_advanced.aspx'
    postdata = {
        '__EVENTARGUMENT': '',
        '__EVENTTARGET': '',
        # ASP.NET hidden fields, copied from a browser session:
        '__EVENTVALIDATION': '/wEWQAK9vYmxAgKjx5bgCgLg+efBCAK19e/PCAKM/pCOAQKzodKvBwKzod6vBwL55IjQCwKtwY7/AQLB36qRDgKqjoH/AgKqjqlwAsKk1SwCpY+B/wIChvvVLAKzp6pwAqulkF8Cr9+7sQwCrZO7sQwC0JG7sQwCmJ67sQwCxP+4sQwCjOK7sQwCuN24sQwCnMO4sQwCi+m7sQwCzo67sQwCwO64sQwCu4G7sQwCk6C7sQwCxey4sQwCtMW7sQwC7tK4sQwCtei4sQwC0567sQwC3JC7sQwCyte4sQwCga+7sQwCltC4sQwC+OG4sQwCtPO4sQwC2PG7sQwCx/+4sQwC8p+7sQwCysG4sQwCmJ67sQwC17C7sQwC9Zq7sQwCv6K7sQwCqp67sQwCpp27sQwC+/W7sQwChuO4sQwCwq63jAkCn/fD4g4Ch4v+9AQC2tnN/QwCutTN+AQCi5CHiwsC+4+b2w4C57Su9QoCnLr4/QoCnLq4yw8C2LmflQbPyP6D6W8FBuDBlaPWWKjtpB25yQ==',
        '__VIEWSTATEGENERATOR': '060A9241',
        '__VIEWSTATE': '/wEPDwUIMTYxNjYzOTcPZBYCAgEPZBYIZg8QDxYCHgtfIURhdGFCb3VuZGdkZGRkAgYPEA8WAh8AZ2RkZGQCBw8QDxYCHwBnZGRkZAIJDxUBiwIyMTI05p2hLCZuYnNwOzEvODXpobUmbmJzcDs8YSB0aXRsZT0n6L+U5Zue5LiL5LiA6aG1JyBzdHlsZT0nZm9udC1zaXplOjEzJyBocmVmPSdkZWZhdWx0X2FkdmFuY2VkLmFzcHg/cD0yJm9yZGVyYnk9R0JzdGFuZGFyZCZhc2M9QVNDJz7kuIvpobU8L2E+Jm5ic3A7PGEgdGl0bGU9J+i3s+iHs+acq+mhtScgc3R5bGU9J2ZvbnQtc2l6ZToxMycgaHJlZj0nZGVmYXVsdF9hZHZhbmNlZC5hc3B4P3A9ODUmb3JkZXJieT1HQnN0YW5kYXJkJmFzYz1BU0MnPuacq+mhtTwvYT5kGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYBBQdjaGtoZWF0inKiP5jhiBznXBVIj1xQqFTH5Fo=',
        'ddlfiled': newfumuben,   # which pedigree field to search ('father'/'mother')
        'varname': name,          # the variety name to search for
        'cycle1': '',
        'cycle2': '',
        'txtyield': '',
        'ddlseason': '',
        'ddlarea': '',
        'quyuinfo': '',
        'quyuprov': '',
        'quyucity': '',
        'btnseek': '检索'
    }
    postdata = urllib.parse.urlencode(postdata).encode('utf-8')
    req = urllib.request.Request(
        url=csdnLoginUrl,
        data=postdata)
    # add some request headers
    req.add_header('Accept', 'text/html, application/xhtml+xml, application/xml; q=0.9, */*; q=0.8')
    req.add_header('Accept-Language', 'zh-CN')
    # note: urllib does not decompress responses itself; if the utf-8 decode
    # below fails, drop this header
    req.add_header('Accept-Encoding', 'gzip, deflate')
    req.add_header('Cache-Control', 'max-age=0')
    req.add_header('Connection', 'Keep-Alive')
    # req.add_header('Content-Length','1842')  # urllib sets this automatically
    req.add_header('Content-Type', 'application/x-www-form-urlencoded')
    # session cookie copied from the browser; replace it with your own if it expires
    req.add_header('Cookie', 'Hm_lpvt_1b18c49e27c0d0ba21d123ac61f37650=1561359522; Hm_lvt_1b18c49e27c0d0ba21d123ac61f37650=1561346703,1561353815,1561354799,1561354895; ASP.NET_SessionId=llgxqa55viqel5vqjsfncbfz; acw_tc=781bad0a15604835210482664e61e4735ad94400e3d7ce3229bfa08500b411; bdshare_firstime=1560483632891')
    req.add_header('Host', 'www.ricedata.cn')
    req.add_header('Referer', 'http://www.ricedata.cn/variety/default_advanced.aspx')
    req.add_header('Upgrade-Insecure-Requests', '1')
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134')
    # open the search URL
    r = urllib.request.urlopen(req)
    html = r.read()
    newhtml = html.decode('utf-8')
    # newdata=open("C:\\Users\\ykl\\Documents\\博士期间文档\\水稻育种\\2.txt",'w+')
    print(name, file=newdata)
    print(newfumuben, file=newdata)
    soup = BeautifulSoup(newhtml, 'lxml')
    trs = soup.find_all('tr')
    td = []
    linkData = []
    zhengli = []
    counttr = 0
    for k in trs:
        counttr = counttr + 1
        if counttr > 7:        # on this page the wanted rows start at the 8th tr
            d = k.find_all('td')
            data = None        # reset so a row without links does not reuse the previous link
            for page in d:
                links = page.select('a')
                for each in links:
                    # if str(each.get('href'))[:1] == '/':  # optional filter on the links
                    data = each.get('href')
                    linkData.append(data)
            if d:
                td.append(d)
                for j in range(len(d)):
                    text = d[j].string   # do not call this `str`: that shadows the built-in
                    d[j] = text
                zhengli.append(d)
                zhengli.append(data)
                print(zhengli, file=newdata)
                zhengli.clear()
# output file; finddata() writes into this global handle
newdata = open("C:\\Users\\ykl\\Documents\\博士期间文档\\水稻育种\\allpinzhong2father.txt", 'w+')
# input file: one variety name per line
pinzhong = open("C:\\Users\\ykl\\Documents\\博士期间文档\\水稻育种\\allpinzhong2.txt", 'r', encoding='UTF-8')
names = pinzhong.readlines()
count = 0
for line in names:
    line = line.strip('\n')
    fumuben = 'father'
    # fumuben = 'mother'
    finddata(line, fumuben)
    print("wrote record", count)
    count = count + 1
pinzhong.close()
newdata.close()
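One caveat about this script: the __VIEWSTATE, __EVENTVALIDATION and Cookie values are snapshots from one browser session and will stop working once that session expires. A more durable variant (a sketch only, not verified against the live site; it assumes the same form field names) is to GET the form page first and harvest the current hidden fields before POSTing:

import requests
from bs4 import BeautifulSoup

url = 'http://www.ricedata.cn/variety/default_advanced.aspx'
session = requests.Session()   # keeps the ASP.NET session cookie between requests
soup = BeautifulSoup(session.get(url).text, 'lxml')

# collect the current __VIEWSTATE, __EVENTVALIDATION, etc. from the form
form = {inp.get('name'): inp.get('value', '')
        for inp in soup.find_all('input', type='hidden') if inp.get('name')}
# sample query; the remaining text fields may also need to be posted empty,
# as in finddata above
form.update({'ddlfiled': 'father', 'varname': '明恢63', 'btnseek': '检索'})
resp = session.post(url, data=form)
print(resp.status_code)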