使用python爬取国内油价数据

    技术2023-07-23  105

    目标:

    爬取 http://youjia.chemcp.com/ 国内油价数据,并存入数据库,存入数据库的表名以 oil+当天日期命名。

    分析过程

    1. 在 Chrome 浏览器中输入网址,查看网页的源代码。
    2. 观察所需爬取的数据在源代码中的位置,发现所需数据被包围在 table 内,

    以 <table width="100%" border="0" cellpadding="4" cellspacing="1" bgcolor="#B6CCE4"> 开头,以 </table> 结尾。

    3. 搜索 <table width="100%" border="0" cellpadding="4" cellspacing="1" bgcolor="#B6CCE4">,发现只有一处匹配,符合爬取条件。
    4. 获取到 table 内的数据后,再获取 table 内每个 <tr></tr> 内的数据。
    5. 通过循环获取 <tr></tr> 内每个 <td></td> 内的数据,保存为一个列表。
    6. 最后存入数据库即可。

    ### 方法说明:

    common.mysql_common.py:使用 Python 操作 MySQL 数据库的方法封装;
    common.table_name.py:命名数据库表名;
    push_data_to_mysql.py:获取网站数据,存入数据库。

    全部代码

    common.mysql_common.py

    # -*- coding: utf-8 -*-
# NOTE: the wildcard import is retained so that modules importing names from
# here keep seeing the same namespace; the class itself uses ``pymysql.``.
from pymysql import *
import pymysql


class Mysql:
    """Small convenience wrapper around a PyMySQL connection.

    A new connection is opened for every execute() call and closed again
    afterwards, whether or not the statement succeeded.

    Args:
        host: MySQL server host name.
        port: MySQL server port.
        user: Login user name.
        password: Login password.
        db: Name of the database to use.
        charset: Connection character set.
    """

    def __init__(self, host, port, user, password, db, charset='utf8'):
        self.host = host
        self.port = port
        self.db = db
        self.user = user
        self.password = password
        self.charset = charset
        # Set by connectsql(); None while no connection is open.
        self.conn = None
        self.cursor = None

    def connectsql(self):
        """Open a connection and create a cursor that returns rows as dicts."""
        self.conn = pymysql.connect(host=self.host, port=self.port,
                                    user=self.user, passwd=self.password,
                                    db=self.db, charset=self.charset)
        self.cursor = self.conn.cursor(pymysql.cursors.DictCursor)

    def closesql(self):
        """Close the cursor and the connection, if they are open."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None

    def execute(self, sql):
        """Run one SQL statement on a fresh connection and commit it.

        Errors are reported with print() instead of being raised, so a failed
        statement does not stop the caller.

        Args:
            sql: Complete SQL statement to execute.

        Returns:
            The rows returned by fetchall(), or None when the statement failed.
        """
        data = None
        try:
            self.connectsql()
            self.cursor.execute(sql)
            data = self.cursor.fetchall()
            print(data)
            self.conn.commit()
        except Exception as e:
            print(e)
        finally:
            # Always release the connection, even when the statement raised;
            # previously a failure left the connection open.
            try:
                self.closesql()
            except Exception as e:
                print(e)
        return data

    common.table_name.py

    # -*- coding: utf-8 -*-
import time
import re  # retained: existing module-level import


# Table name
def table_name(day=None):
    """Return the table name for one day of oil prices: ``oil`` + YYYYMMDD.

    Args:
        day: Optional date to name the table after. Accepts a
            ``time.struct_time`` (or an equivalent 9-tuple), or any object
            with a ``timetuple()`` method such as ``datetime.date``.
            Defaults to the current local date.

    Returns:
        str: The table name, e.g. ``"oil20230723"``.
    """
    if day is None:
        day = time.localtime()
    elif hasattr(day, "timetuple"):
        day = day.timetuple()
    # "%Y%m%d" gives the same digits as formatting with dashes and removing them.
    return "oil" + time.strftime("%Y%m%d", day)

    push_data_to_mysql.py

    get_html_text:获取网站的文本数据;
    parse_data:对 HTML 数据进行解析,获取到所需数据存入一个列表并返回;
    create:在 MySQL 中创建一个表,用来存入数据(表名为 oil+当天年月日);
    insert:将数据插入创建的表。
    # -*- coding: utf-8 -*-
import requests
import re
from common.mysql_common import Mysql
from common.table_name import table_name


class PushOilDataToMysql:
    """Scrape fuel prices from youjia.chemcp.com and store them in MySQL.

    Constructing an instance immediately downloads and parses the page; call
    create() and then insert() to persist the parsed rows.

    Attributes:
        html: Raw page text, or "" when the download failed.
        oil_data: List of (area, oil89, oil92, oil95, oil98, oil0,
            update_time) tuples, one per successfully parsed row.
        name: Target table name as returned by table_name().
        do_mysql: Mysql helper used for all database access.
    """

    # re.S lets "." match newlines as well; the table, its rows and its cells
    # span several lines in the page source.
    _TABLE_RE = re.compile(
        r'<table width="100%" border="0" cellpadding="4" cellspacing="1" '
        r'bgcolor="#B6CCE4">\s*(.*?)</table>', re.S)
    _ROW_RE = re.compile(r'<tr>\s*(.*?)</tr>', re.S)
    _CELL_RE = re.compile(r'<td bgcolor="#FFFFFF">(.*?)</td>', re.S)
    _TAG_RE = re.compile(r'<[^>]*>')
    _PRICE_RE = re.compile(r'\d+(?:\.\d+)?\Z')

    def __init__(self):
        self.html = self.get_html_text()
        self.oil_data = self.parse_data()
        self.name = table_name()
        # NOTE(review): connection settings (including the password) are
        # hard-coded; consider loading them from configuration.
        self.do_mysql = Mysql('localhost', 3306, 'root', '123456', 'myoildata')

    @staticmethod
    def get_html_text():
        """Download the price page.

        Returns:
            str: The decoded page text, or "" when the request failed.
        """
        url = 'http://youjia.chemcp.com/'
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException as e:
            print("获取网页失败: %s" % e)
            return ""

    @classmethod
    def _parse_price(cls, text):
        """Convert one price cell to float; raise ValueError if not a plain number."""
        cleaned = cls._TAG_RE.sub('', text).strip()
        # Scraped text is untrusted: validate and convert, never eval().
        if not cls._PRICE_RE.match(cleaned):
            raise ValueError("not a price: %r" % cleaned)
        return float(cleaned)

    @classmethod
    def _parse_row(cls, row_html):
        """Turn the inner HTML of one <tr> into a data tuple, or None if malformed."""
        cells = cls._CELL_RE.findall(row_html)
        if len(cells) < 7:
            return None
        # The area cell wraps its text in markup; keep only the text.
        area = cls._TAG_RE.sub('', cells[0]).strip()
        try:
            prices = tuple(cls._parse_price(cell) for cell in cells[1:6])
        except ValueError:
            return None
        update_time = cells[6].strip()
        return (area,) + prices + (update_time,)

    def parse_data(self):
        """Extract the price rows from the downloaded page.

        Returns:
            list: (area, oil89, oil92, oil95, oil98, oil0, update_time)
            tuples. Empty when the price table is not found. Rows that cannot
            be parsed are skipped instead of aborting the remaining rows.
        """
        table = self._TABLE_RE.search(self.html or "")
        if table is None:
            print("获取油价数据失败")
            return []

        oil_data = []
        rows = self._ROW_RE.findall(table.group(1))
        # The first matched row is skipped, as before (presumably the header).
        for row_html in rows[1:]:
            record = self._parse_row(row_html)
            if record is None:
                print("跳过无法解析的数据行")
                continue
            oil_data.append(record)
        return oil_data

    def create(self):
        """Create today's table unless it already exists."""
        # "if not exists" keeps a second run on the same day from failing.
        sql_create_table = 'create table if not exists `%s`(id int auto_increment,`area` varchar(20),`oil89` varchar(20),`oil92` varchar(20),`oil95` varchar(20),`oil98` varchar(20), `oil0` varchar(20), `update_time` varchar(255), primary key(id))' % self.name
        self.do_mysql.execute(sql_create_table)

    def insert(self):
        """Insert all parsed rows into today's table on a single connection.

        Values are passed as query parameters so scraped text is escaped by
        the driver. Errors are printed rather than raised, matching
        Mysql.execute().
        """
        if not self.oil_data:
            return
        # Only the table name is interpolated; it is generated locally, not scraped.
        sql_insert = ('insert into `' + self.name + '`'
                      '(area, oil89, oil92, oil95, oil98, oil0, update_time) '
                      'values (%s, %s, %s, %s, %s, %s, %s)')
        db = self.do_mysql
        try:
            db.connectsql()
            try:
                db.cursor.executemany(sql_insert, self.oil_data)
                db.conn.commit()
            finally:
                db.closesql()
        except Exception as e:
            print(e)


if __name__ == '__main__':
    test = PushOilDataToMysql()
    test.create()
    test.insert()
    Processed: 0.008, SQL: 9