Python | Scraping Lagou Job Listings with Selenium
Published: 2019-04-29


Scraping the first page of job listings

The script below uses Selenium to load the Lagou search results for Python positions, collects every position_link URL from the list page with lxml XPath, and then visits each detail page to extract the title, salary, city, required experience, education, and job description.


from selenium import webdriver
from lxml import etree
import re
import time


class LagouSpider(object):
    def __init__(self):
        self.driver = webdriver.Chrome()
        # Search URL for Python positions
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.position = []

    def run(self):
        self.driver.get(self.url)
        source = self.driver.page_source
        self.parse_list_page(source)

    def parse_list_page(self, source):
        html = etree.HTML(source)
        # Detail-page URLs of every position on the list page
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)

    def request_detail_page(self, url):
        self.driver.get(url)
        # Grab the source code of the detail page
        source = self.driver.page_source
        self.parse_detail_page(source)

    def parse_detail_page(self, source):
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        position = {
            'name': position_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
        }
        self.position.append(position)
        print(position)
        print('-' * 200)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
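The script above only prints each position dict and keeps it in self.position. As a minimal sketch that is not part of the original post (the helper name save_positions and the output filename are assumptions), the collected results could be written to CSV with the standard library once run() finishes; the field names simply mirror the keys of the position dict:

import csv

def save_positions(positions, filename='lagou_positions.csv'):
    # Write the list of position dicts produced by LagouSpider to a CSV file.
    fieldnames = ['name', 'salary', 'city', 'work_years', 'education', 'desc']
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(positions)

# Hypothetical usage after the spider has run:
# spider = LagouSpider()
# spider.run()
# save_positions(spider.position)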

Scraping job listings from all pages

This version adds pagination and a few robustness tweaks: each detail page is opened in a new browser tab and closed afterwards, WebDriverWait ensures the relevant elements are present before parsing, the "next page" button is clicked until it carries the pager_next_disabled class, and the company name is extracted in addition to the fields above.

from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class LagouSpider(object):
    def __init__(self):
        self.driver = webdriver.Chrome()
        # Search URL for Python positions in Beijing
        self.url = 'https://www.lagou.com/jobs/list_python?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput='
        self.position = []

    def run(self):
        self.driver.get(self.url)
        while True:
            # Wait until the pager is present, then grab the freshly loaded list page
            WebDriverWait(driver=self.driver, timeout=20).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            # Click "next page" until the button is disabled
            next_btn = self.driver.find_element_by_xpath(
                "//div[@class='pager_container']/span[last()]")
            if "pager_next_disabled" in next_btn.get_attribute("class"):
                break
            else:
                next_btn.click()
            time.sleep(1)

    def parse_list_page(self, source):
        html = etree.HTML(source)
        # Detail-page URLs of every position on the current list page
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)

    def request_detail_page(self, url):
        # Open the detail page in a new tab and switch to it
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(driver=self.driver, timeout=20).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']"))
        )
        # Grab the source code of the detail page
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the detail tab and switch back to the list page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = html.xpath("//h2[@class='fl']/text()")[0].strip()
        position = {
            'name': position_name,
            'company_name': company_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
        }
        self.position.append(position)
        print(position)
        print('-' * 200)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
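The post dates from 2019; on current Selenium 4 the find_element_by_* helpers no longer exist and element lookup goes through By. A minimal sketch of how the driver setup and the next-page lookup would look there, not taken from the original post (the headless flag is an optional assumption, and the XPath is reused from the code above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # optional: run without a visible browser window

driver = webdriver.Chrome(options=options)
driver.get('https://www.lagou.com/jobs/list_python?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput=')

# Wait for the pager, then locate the "next page" button with the Selenium 4 API
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
)
next_btn = driver.find_element(By.XPATH, "//div[@class='pager_container']/span[last()]")
if "pager_next_disabled" not in next_btn.get_attribute("class"):
    next_btn.click()

driver.quit()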

 

Reposted from: http://vinmf.baihongyu.com/
