Scraping web pages with Python + POST (2)

Posted by Kelly Yang on June 24, 2019

Using Python to scrape pages that require a POST request (2)

The previous post obtained the raw HTML; this post adapts the parsing to the page's format and organizes the extracted data.

Format of the parent/progeny pages

Changes

Changed the data input: variety names are now read from a file and passed line by line to the finddata function, which scrapes the corresponding page for each one (this is the driver loop at the bottom of the code).

Changed the data cleanup: the rows are located by searching for tr tags, and on this particular page the data of interest starts at the 8th tr. Within each tr, search for td tags and href attributes (adjust per page; sometimes th tags need to be searched as well) to extract both the cell values and the hyperlinks. A minimal sketch of this extraction follows.
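For illustration only, here is a small, self-contained sketch of that tr/td/href extraction run on a toy table (the markup below is invented; on the real page the first 7 tr rows are headers and must be skipped):

from bs4 import BeautifulSoup

toy_html = """<table>
<tr><td><a href="varis/606473.htm">福两优63</a></td><td>早稻</td></tr>
</table>"""

soup = BeautifulSoup(toy_html, 'lxml')
for tr in soup.find_all('tr'):      # on the real page: skip the first 7 rows
    cells = tr.find_all('td')
    texts = [td.get_text(strip=True) for td in cells]
    links = [a.get('href') for a in tr.find_all('a')]
    print(texts, links)             # ['福两优63', '早稻'] ['varis/606473.htm']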

Resulting data format

Example:

珍桂矮1号
father
湘早籼17号
father
明恢63
father
[['福两优63', '早稻', '福建省农科院稻麦所', None, None, '闽审稻2000008', '120.0', None, None, None, None, None], 'varis/606473.htm']
[['两优2163', '晚稻', '福建省农业科学院', None, None, '闽审稻2000006', '130.0', None, None, None, None, None], 'varis/602856.htm']
[['泉珍10号', '早稻', '福建省泉州市农业科学研究所', None, None, '闽审稻2004002', '118.0', '458.4', '100.0', None, None, None], 'varis/601345.htm']
[['皖稻109', '晚稻', '宣城市种子公司', None, None, '皖品审03010387', '126.0', '439.6', '95.0', None, None, None], 'varis/605189.htm']
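Since each result line is a Python literal (a list of cell strings plus a relative link), it can be read back later with ast.literal_eval. A small sketch, assuming an output file written by the code below (the relative path here is hypothetical):

import ast

with open('allpinzhong2father.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line.startswith('['):             # skip the name/'father' marker lines
            fields, link = ast.literal_eval(line)
            print(fields[0], link)           # e.g. 福两优63 varis/606473.htm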

Code

#coding=utf-8

import urllib.request
import urllib.parse
from bs4 import BeautifulSoup



def finddata(name,newfumuben):
    # POST a search for `name` as the given parent type ('father'/'mother')
    # and write the parsed result rows to the global output file
    searchUrl='http://www.ricedata.cn/variety/default_advanced.aspx'
    postdata = {
    '__EVENTARGUMENT':'',
    '__EVENTTARGET':'',
    # the ASP.NET hidden fields below were copied from a browser session and may expire:
    '__EVENTVALIDATION': '/wEWQAK9vYmxAgKjx5bgCgLg+efBCAK19e/PCAKM/pCOAQKzodKvBwKzod6vBwL55IjQCwKtwY7/AQLB36qRDgKqjoH/AgKqjqlwAsKk1SwCpY+B/wIChvvVLAKzp6pwAqulkF8Cr9+7sQwCrZO7sQwC0JG7sQwCmJ67sQwCxP+4sQwCjOK7sQwCuN24sQwCnMO4sQwCi+m7sQwCzo67sQwCwO64sQwCu4G7sQwCk6C7sQwCxey4sQwCtMW7sQwC7tK4sQwCtei4sQwC0567sQwC3JC7sQwCyte4sQwCga+7sQwCltC4sQwC+OG4sQwCtPO4sQwC2PG7sQwCx/+4sQwC8p+7sQwCysG4sQwCmJ67sQwC17C7sQwC9Zq7sQwCv6K7sQwCqp67sQwCpp27sQwC+/W7sQwChuO4sQwCwq63jAkCn/fD4g4Ch4v+9AQC2tnN/QwCutTN+AQCi5CHiwsC+4+b2w4C57Su9QoCnLr4/QoCnLq4yw8C2LmflQbPyP6D6W8FBuDBlaPWWKjtpB25yQ==',
    '__VIEWSTATEGENERATOR':'060A9241',
    '__VIEWSTATE':'/wEPDwUIMTYxNjYzOTcPZBYCAgEPZBYIZg8QDxYCHgtfIURhdGFCb3VuZGdkZGRkAgYPEA8WAh8AZ2RkZGQCBw8QDxYCHwBnZGRkZAIJDxUBiwIyMTI05p2hLCZuYnNwOzEvODXpobUmbmJzcDs8YSB0aXRsZT0n6L+U5Zue5LiL5LiA6aG1JyBzdHlsZT0nZm9udC1zaXplOjEzJyBocmVmPSdkZWZhdWx0X2FkdmFuY2VkLmFzcHg/cD0yJm9yZGVyYnk9R0JzdGFuZGFyZCZhc2M9QVNDJz7kuIvpobU8L2E+Jm5ic3A7PGEgdGl0bGU9J+i3s+iHs+acq+mhtScgc3R5bGU9J2ZvbnQtc2l6ZToxMycgaHJlZj0nZGVmYXVsdF9hZHZhbmNlZC5hc3B4P3A9ODUmb3JkZXJieT1HQnN0YW5kYXJkJmFzYz1BU0MnPuacq+mhtTwvYT5kGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYBBQdjaGtoZWF0inKiP5jhiBznXBVIj1xQqFTH5Fo=', 
    'ddlfiled':newfumuben,  # 'father' or 'mother'
    'varname':name,         # the variety name to search for
    'cycle1':'',
    'cycle2':'',
    'txtyield':'',
    'ddlseason':'',
    'ddlarea':'',
    'quyuinfo':'',
    'quyuprov':'',
    'quyucity':'',
    'btnseek':'检索'  # the "Search" button
    }
    postdata = urllib.parse.urlencode(postdata).encode('utf-8')
    req = urllib.request.Request(url=searchUrl, data=postdata)
    # add some request headers
    req.add_header('Accept', 'text/html, application/xhtml+xml, application/xml; q=0.9, */*; q=0.8')
    req.add_header('Accept-Language', 'zh-CN')
    req.add_header('Accept-Encoding', 'gzip, deflate')
    req.add_header('Cache-Control','max-age=0')
    req.add_header('Connection', 'Keep-Alive')
    # req.add_header('Content-Length','1842')
    req.add_header('Content-Type','application/x-www-form-urlencoded')
    # session cookie copied from the browser; it may expire
    req.add_header('Cookie','Hm_lpvt_1b18c49e27c0d0ba21d123ac61f37650=1561359522; Hm_lvt_1b18c49e27c0d0ba21d123ac61f37650=1561346703,1561353815,1561354799,1561354895; ASP.NET_SessionId=llgxqa55viqel5vqjsfncbfz; acw_tc=781bad0a15604835210482664e61e4735ad94400e3d7ce3229bfa08500b411; bdshare_firstime=1560483632891')
    req.add_header('Host','www.ricedata.cn')
    req.add_header('Referer','http://www.ricedata.cn/variety/default_advanced.aspx')
    req.add_header('Upgrade-Insecure-Requests','1')
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134')
    # send the POST request and read the response
    r = urllib.request.urlopen(req)
    html = r.read()
    newhtml = html.decode('utf-8')
    # `newdata` is the global output file opened at the bottom of the script
    print(name, file=newdata)
    print(newfumuben, file=newdata)
    soup = BeautifulSoup(newhtml, 'lxml')
    trs = soup.find_all('tr')
    counttr = 0
    for k in trs:
        counttr = counttr + 1
        if counttr > 7:               # on this page the data rows start at the 8th <tr>
            d = k.find_all('td')
            if not d:                 # skip rows without data cells
                continue
            data = None               # reset so a link cannot leak in from the previous row
            for page in d:
                for each in page.select('a'):
                    # if str(each.get('href'))[:1] == '/':  (optional href filter)
                    data = each.get('href')
            cells = [cell.string for cell in d]   # plain text of each cell
            zhengli = [cells, data]
            print(zhengli, file=newdata)

newdata = open("C:\\Users\\ykl\\Documents\\博士期间文档\\水稻育种\\allpinzhong2father.txt", 'w+')
pinzhong = open("C:\\Users\\ykl\\Documents\\博士期间文档\\水稻育种\\allpinzhong2.txt", 'r', encoding='UTF-8')
names = pinzhong.readlines()
count = 0
for line in names:
    line = line.strip('\n')
    fumuben = 'father'   # switch to 'mother' to search female parents instead
    finddata(line, fumuben)
    count = count + 1
    print("Processed", count)
pinzhong.close()
newdata.close()
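For reference, the same POST can also be written with the requests library. This is only a sketch under two assumptions: the ASP.NET hidden fields (__VIEWSTATE and friends) are scraped from a fresh GET instead of hard-coded, and the server tolerates the other form fields being omitted; if it does not, include them as empty strings as in the code above. The function name finddata_requests is hypothetical:

import requests
from bs4 import BeautifulSoup

def finddata_requests(name, newfumuben):
    url = 'http://www.ricedata.cn/variety/default_advanced.aspx'
    session = requests.Session()
    # GET the form first so the ASP.NET hidden fields are current
    page = session.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
    form = {tag['name']: tag.get('value', '')
            for tag in soup.select('input[type=hidden]')}
    # fill in the search parameters (assumption: empty fields can be omitted)
    form.update({'ddlfiled': newfumuben, 'varname': name, 'btnseek': '检索'})
    resp = session.post(url, data=form)
    return resp.text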