Python | Scraping Lagou Job Listings with Selenium
Published: 2019-04-29


Scraping the first page of job listings

The script below uses Selenium to load the Lagou search results for Python positions, collects every position_link URL from the list page with lxml XPath, and then visits each detail page to extract the title, salary, city, required experience, education, and job description.


from selenium import webdriver
from lxml import etree
import re
import time


class LagouSpider(object):
    def __init__(self):
        self.driver = webdriver.Chrome()
        # Search URL for Python positions
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.position = []

    def run(self):
        self.driver.get(self.url)
        source = self.driver.page_source
        self.parse_list_page(source)

    def parse_list_page(self, source):
        html = etree.HTML(source)
        # Detail-page URLs of every position on the list page
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)

    def request_detail_page(self, url):
        self.driver.get(url)
        # Grab the source code of the detail page
        source = self.driver.page_source
        self.parse_detail_page(source)

    def parse_detail_page(self, source):
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        position = {
            'name': position_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
        }
        self.position.append(position)
        print(position)
        print('-' * 200)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
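The script above only prints each position dict and keeps it in self.position. As a minimal sketch that is not part of the original post (the helper name save_positions and the output filename are assumptions), the collected results could be written to CSV with the standard library once run() finishes; the field names simply mirror the keys of the position dict:

import csv

def save_positions(positions, filename='lagou_positions.csv'):
    # Write the list of position dicts produced by LagouSpider to a CSV file.
    fieldnames = ['name', 'salary', 'city', 'work_years', 'education', 'desc']
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(positions)

# Hypothetical usage after the spider has run:
# spider = LagouSpider()
# spider.run()
# save_positions(spider.position)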

Scraping job listings from all pages

This version adds pagination and a few robustness tweaks: each detail page is opened in a new browser tab and closed afterwards, WebDriverWait ensures the relevant elements are present before parsing, the "next page" button is clicked until it carries the pager_next_disabled class, and the company name is extracted in addition to the fields above.

from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class LagouSpider(object):
    def __init__(self):
        self.driver = webdriver.Chrome()
        # Search URL for Python positions in Beijing
        self.url = 'https://www.lagou.com/jobs/list_python?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput='
        self.position = []

    def run(self):
        self.driver.get(self.url)
        while True:
            # Wait until the pager is present, then grab the freshly loaded list page
            WebDriverWait(driver=self.driver, timeout=20).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            # Click "next page" until the button is disabled
            next_btn = self.driver.find_element_by_xpath(
                "//div[@class='pager_container']/span[last()]")
            if "pager_next_disabled" in next_btn.get_attribute("class"):
                break
            else:
                next_btn.click()
            time.sleep(1)

    def parse_list_page(self, source):
        html = etree.HTML(source)
        # Detail-page URLs of every position on the current list page
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)

    def request_detail_page(self, url):
        # Open the detail page in a new tab and switch to it
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(driver=self.driver, timeout=20).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']"))
        )
        # Grab the source code of the detail page
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the detail tab and switch back to the list page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = html.xpath("//h2[@class='fl']/text()")[0].strip()
        position = {
            'name': position_name,
            'company_name': company_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
        }
        self.position.append(position)
        print(position)
        print('-' * 200)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
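The post dates from 2019; on current Selenium 4 the find_element_by_* helpers no longer exist and element lookup goes through By. A minimal sketch of how the driver setup and the next-page lookup would look there, not taken from the original post (the headless flag is an optional assumption, and the XPath is reused from the code above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # optional: run without a visible browser window

driver = webdriver.Chrome(options=options)
driver.get('https://www.lagou.com/jobs/list_python?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput=')

# Wait for the pager, then locate the "next page" button with the Selenium 4 API
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
)
next_btn = driver.find_element(By.XPATH, "//div[@class='pager_container']/span[last()]")
if "pager_next_disabled" not in next_btn.get_attribute("class"):
    next_btn.click()

driver.quit()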

 

Reposted from: http://vinmf.baihongyu.com/
