新浪新闻数据爬取与清洗 基本要求:完成新浪新闻排行中文章的数据爬取,包括标题、媒体、时间、内容。 进阶要求:对最近一周出现次数最多的关键字排名并进行可视化显示。 新浪新闻数据管理系统 基本要求:完成新浪新闻的展示功能,同时提供删除和查询功能,查询可以按照题目、时间、题目和内容关键字进行查询;完成用户的登录、注册、退出功能。 进阶要求:完成数据导出excel功能、分页功能。
运行代码之前,先在数据库创建一个如下所示的表格(库名:xinlangxinwen)
# Ciyun.py import jieba import wordcloud class Ciyun: def __init__(self, tablena, txt): excluds = {'目前', '没有', '责任编辑', '标题', '他们', '已经'} words = jieba.lcut(txt) counts = {} for word in words: if len(word) == 1: continue else: rword = word counts[rword] = counts.get(rword, 0) + 1 for word in excluds: del counts[word] items = list(counts.items()) items.sort(key=lambda x: x[1], reverse=True) text = '' for i in range(20): word, count = items[i] text = text + ' ' + word w = wordcloud.WordCloud(background_color='white', font_path='C/Windows/Font/simkai.ttf') w.generate(text) w.to_file('D:/python object/test/新浪新闻爬取/Ciyun/' + tablena + '.png') # Excel.py from openpyxl import Workbook class Excel: def __init__(self, tablena, list): wb = Workbook() ws = wb["Sheet"] ws['A1'] = 'idnum' ws['B1'] = 'title' ws['C1'] = 'media' ws['D1'] = 'date' ws['E1'] = 'text' for i in range(2, len(list) + 2): for j in range(1, 6): ws.cell(row=i, column=j).value = str(list[i - 2][j - 1]) # 写单元格 wb.save('D:/python object/test/新浪新闻爬取/Excel/' + tablena + '.xlsx') # getsHTML.py import re import requests from bs4 import BeautifulSoup class getsHTML: # 获取页面的HTML def getHTML(self, url): try: r = requests.get(url) r.raise_for_status() r.encoding = r.apparent_encoding return r.text except Exception as e: print('wrong! ' + e) # 提取每一份排名的js子链接 def getJS(self, string, divid): so = BeautifulSoup(string, 'html.parser') Js = so.find(attrs={'id': divid}) jsurl = Js.script.attrs['src'] return jsurl # 获取分享数排行的js子连接 def getfenxiang(self, string, divid): so = BeautifulSoup(string, 'html.parser') Js = so.find(attrs={'id': divid}) i = 0 for k in Js.find_all('script'): i += 1 if len(Js.find_all('script')) == i: jsurl = k.attrs['src'] return jsurl # 提取js中的列表 def gettitle(self, jshtml): pat = re.compile("\[\{.*?\}\]") data = pat.findall(jshtml) return data[0] # 提取新闻内容 def gettext(self, xinwen): jsurl = '' so = BeautifulSoup(xinwen, 'html.parser') mt = so.find(attrs={'name': 'mediaid'}).attrs['content'] for k in so.find_all('p'): neirong = k.string if neirong is None: continue if neirong[:5] == '电话:40' or neirong[:8] == '“掌”握科技鲜闻' or neirong[:9] == '24小时滚动播报最' or neirong[ :8] == '更多娱乐八卦、明' or neirong[ :6] == '更多猛料!欢' or neirong[ :11] == 'Copyright ©': break jsurl += neirong return mt, jsurl # 提取图片内容 def getPicture(self, title, picurl): idnum = 0 text = '' so = BeautifulSoup(picurl, 'html.parser') Js = so.find(attrs={'id': 'eData'}) for k in Js.find_all('dl'): idnum += 1 picU = k.find('dd').string houzhui = picU[-4:] pwd = 'D:/python object/test/新浪新闻爬取/Picture/' + title + str(idnum) + houzhui with open(pwd, 'wb') as f: r = requests.get(picU) f.write(r.content) f.close() text += pwd + '; ' return text # main.py import getsHTML import mySql import zhuce import TKinter import os import shutil import tkinter as tk from tkinter import messagebox class main: def mains(self, html, tablena, div): if div[4] == '3': jsurl = self.gH.getfenxiang(html, div) else: jsurl = self.gH.getJS(html, div) jshtml = self.gH.getHTML(jsurl) jslist = self.gH.gettitle(jshtml) jslist = jslist.replace('false', '""') # 'Con92'可能会出现false,防止程序出错,去除(为防止其它也出现所以全部检验一遍) lists = eval(jslist) for i in range(len(lists)): xinwenurl = lists[i].get('url') xinwenurls = eval(repr(xinwenurl).replace('\\', '')) # 去除转义符 xinwen = self.gH.getHTML(xinwenurls) try: media, txt = self.gH.gettext(xinwen) except: continue if txt is None: continue txt = txt.replace(' ', '') # 去除文章中的空格 title = lists[i].get('title') date = lists[i].get('create_date') self.idnum += 1 self.mS.inSql(tablena, self.idnum, title, media, date, txt) print(self.idnum) def maines(self, html, tablena, div): shutil.rmtree('D:/python object/test/新浪新闻爬取/Picture') # 清除老旧图片,强制删除文件夹 os.mkdir('D:/python object/test/新浪新闻爬取/Picture') # 重新创建文件夹 jsurl = self.gH.getJS(html, div) jshtml = self.gH.getHTML(jsurl) jslist = self.gH.gettitle(jshtml) jslist = jslist.replace('false', '""') # 'Con92'可能会出现false,防止程序出错,去除(为防止其它也出现所以全部检验一遍) lists = eval(jslist) for i in range(len(lists)): xinwenurl = lists[i].get('url') xinwenurls = eval(repr(xinwenurl).replace('\\', '')) # 去除转义符 xinwen = self.gH.getHTML(xinwenurls) title = lists[i].get('title') date = lists[i].get('create_date') media = lists[i].get('media') try: txt = self.gH.getPicture(title, xinwen) except: continue if txt is None: continue self.idnum += 1 self.mS.inSql(tablena, self.idnum, title, media, date, txt) print(self.idnum) def main(self): self.idnum = 0 tableName = { 'Con11': '点击量总排行', 'Con12': '评论数总排行', 'Con15': '图片总排行', 'Con21': '国内新闻点击量', 'Con22': '国内新闻评论数', 'Con23': '国内新闻分享数', 'Con31': '国际新闻点击量', 'Con32': '国际新闻评论数', 'Con33': '国际新闻分享数', 'Con41': '社会新闻点击量', 'Con42': '社会新闻评论数', 'Con43': '社会新闻分享数', 'Con51': '体育新闻点击量', 'Con52': '体育新闻评论数', 'Con53': '体育新闻分享数', 'Con61': '科技新闻点击量', 'Con62': '科技新闻评论数', 'Con63': '科技新闻分享数', 'Con71': '财经新闻点击量', 'Con72': '财经新闻评论数', 'Con73': '财经新闻分享数', 'Con81': '娱乐新闻点击量', 'Con82': '娱乐新闻评论数', 'Con83': '娱乐新闻分享数', 'Con91': '军事新闻点击量', 'Con92': '军事新闻评论数', 'Con93': '军事新闻分享数' } list_se = self.mS.useSql(self.name.get()) if list_se == (): messagebox.showerror(title='爬取失败', message='用户名不存在') elif list_se[0][0] == self.passwd.get(): YN = messagebox.askokcancel('警告', '爬虫会覆盖原有数据,在提示完成前,请勿进行操作!') # 是/否,返回值yes/no if YN: html = self.gH.getHTML(r'http://news.sina.com.cn/hotnews/') for i in range(1, 10): for j in range(1, 4): if i == 1 and j == 3: continue else: tablena = tableName.get('Con' + str(i) + str(j)) self.mS.crSql(tablena) self.mains(html, tablena, 'Con' + str(i) + str(j)) tablena = tableName.get('Con15') self.mS.crSql(tablena) self.maines(html, tablena, 'Con15') messagebox.showinfo(title='成功', message='爬取完成') else: messagebox.showerror(title='爬取失败', message='密码错误') def wins(self): self.gH = getsHTML.getsHTML() self.mS = mySql.mySql() self.zc = zhuce.zhuce() self.TK = TKinter.TKinter() login = tk.Tk() login.title('用户登录') login.geometry('210x180+100+100') tk.Label(login, text='用户登录').grid(row=0, column=0, columnspan=2, pady=5) tk.Label(login, text='用户名:').grid(row=1, column=0, pady=5) self.name = tk.Entry(login) self.name.grid(row=1, column=1, pady=5) tk.Label(login, text='密码:').grid(row=2, column=0, sticky=tk.E, pady=5) self.passwd = tk.Entry(login, show='*') self.passwd.grid(row=2, column=1, pady=5) tk.Button(login, text='登录', command=self.logins).grid(row=3, column=1, pady=5) tk.Button(login, text='注册', command=self.zhuce).grid(row=3, column=0, pady=5) tk.Button(login, text='注销', command=self.zhuxiao).grid(row=4, column=0, pady=5) tk.Button(login, text='爬虫', command=self.main).grid(row=4, column=1, pady=5) login.mainloop() def logins(self): list_se = self.mS.useSql(self.name.get()) if list_se == (): messagebox.showerror(title='登录失败', message='用户名不存在') elif list_se[0][0] == self.passwd.get(): self.TK.wins() else: messagebox.showerror(title='登录失败', message='密码错误') def zhuce(self): self.zc.wins() def zhuxiao(self): list_se = self.mS.useSql(self.name.get()) if list_se == (): messagebox.showerror(title='注销失败', message='用户名不存在') elif list_se[0][0] == self.passwd.get(): YN = messagebox.askokcancel('警告', '注销后将无法找回') # 是/否,返回值yes/no if YN: self.mS.deUse(self.name.get()) messagebox.showinfo(title='成功', message='注销成功') else: messagebox.showerror(title='注销失败', message='密码错误') ma = main() ma.wins() # mySql.py import pymysql import time from tkinter import messagebox import Ciyun import Excel class mySql: # 建立连接 def __init__(self): self.connection = pymysql.connect('localhost', 'root', '123456', 'xinlangxinwen') self.cursor = self.connection.cursor() # 建立数据库表格 def crSql(self, tablena): sql = 'create table ' + tablena + '(idnum int not null, title varchar(50) not null, media varchar(50), date date, text text )' self.cursor.execute('use xinlangxinwen') self.cursor.execute('DROP TABLE IF EXISTS ' + tablena) self.cursor.execute(sql) # 数据导入数据库 def inSql(self, tablena, idnum, title, media, date, text): title = title.replace('"', r'\"') media = media.replace('"', r'\"') text = text.replace('"', r'\"') # 去除双引号对输入的影响 insert = 'insert into ' + tablena + ' values ("' + str( idnum) + '","' + title + '","' + media + '","' + date + '","' + text + '")' try: time.strptime(date, "%Y-%m-%d") except: messagebox.showerror(title='错误', message='日期格式不正确') try: self.cursor.execute(insert) self.connection.commit() except: pass # 1406, "Data too long for column 'text' at row 1" # 从数据库爬取新闻题目 def outSql(self, tablena, title, media, date, text, page): select = 'select title from ' + tablena + ' where title like "%' + title + '%" and media like "%' + media + '%" and date like "%' + date + '%" and text like "%' + text + '%" limit ' + str( page) + ',20' self.cursor.execute(select) list_se = self.cursor.fetchall() return list_se # 查询新闻的详细信息 def selSql(self, tablena, title): select = 'select * from ' + tablena + ' where title = "' + title + '"' self.cursor.execute(select) list_se = self.cursor.fetchall() return list_se # 更新新闻信息 def upSql(self, tablena, idnum, title, media, date, text): sql = 'update ' + tablena + ' set title = "' + title + '", media = "' + media + '", date = "' + date + '", text="' + text + '" where idnum = ' + str( idnum) self.cursor.execute(sql) self.connection.commit() # 删除新闻 def delSql(self, tablena, idnum): sql = 'delete from ' + tablena + ' where idnum = ' + str(idnum) self.cursor.execute(sql) self.connection.commit() # 用户登录验证 def useSql(self, name): select = 'select password from user where name= "' + name + '"' self.cursor.execute(select) list_se = self.cursor.fetchall() return list_se # 用户注册 def crUse(self, name, passwd): insert = 'insert into user values ("' + name + '" , "' + passwd + '")' self.cursor.execute(insert) self.connection.commit() # 用户注销 def deUse(self, name): sql = 'delete from user where name = "' + name + '"' self.cursor.execute(sql) self.connection.commit() # 读取内容,生成词云 def Ciyun(self, tablena): txt = '' select = 'select text from ' + tablena self.cursor.execute(select) list_se = self.cursor.fetchall() for i in list_se: txt += str(i) Ciyun.Ciyun(tablena, txt) # 读取数据库全部数据,用于导出到Excel def Excel(self, tablena): select = 'select * from ' + tablena self.cursor.execute(select) list_se = self.cursor.fetchall() Excel.Excel(tablena, list_se) # 析构函数,关闭通道 def __del__(self): self.cursor.close() self.connection.close() # TKinter.py import mySql import xiangxi import time import tkinter as tk from tkinter import ttk from tkinter import messagebox class TKinter: def wins(self): self.idnum = 0 self.win = tk.Tk() self.win.title('新闻总览') self.win.geometry('500x700+350+50') tk.Label(self.win, text='新闻类别:').grid(row=0, column=0, pady=5, sticky=tk.E) self.cmb = ttk.Combobox(self.win, state='readonly') # 设置新闻类别的下拉框 self.cmb.grid(row=0, column=1, sticky=tk.W) self.cmb['value'] = ( '点击量总排行', '评论数总排行', '图片总排行', '国内新闻点击量', '国内新闻评论数', '国内新闻分享数', '国际新闻点击量', '国际新闻评论数', '国际新闻分享数', '社会新闻点击量', '社会新闻评论数', '社会新闻分享数', '体育新闻点击量', '体育新闻评论数', '体育新闻分享数', '科技新闻点击量', '科技新闻评论数', '科技新闻分享数', '财经新闻点击量', '财经新闻评论数', '财经新闻分享数', '娱乐新闻点击量', '娱乐新闻评论数', '娱乐新闻分享数', '军事新闻点击量', '军事新闻评论数', '军事新闻分享数') self.cmb.current(0) self.mS = mySql.mySql() tk.Label(self.win, text='标题:').grid(row=1, column=0, pady=5, sticky=tk.E) self.titleEntry = tk.Entry(self.win, width=40) self.titleEntry.grid(row=1, column=1, pady=5, sticky=tk.W) tk.Label(self.win, text='媒体:').grid(row=2, column=0, pady=5, sticky=tk.E) self.mediaEntry = tk.Entry(self.win, width=40) self.mediaEntry.grid(row=2, column=1, pady=5, sticky=tk.W) tk.Label(self.win, text='时间:').grid(row=3, column=0, pady=5, sticky=tk.E) self.dateEntry = tk.Entry(self.win, width=40) self.dateEntry.insert(tk.END, 'YYYY-MM-DD') self.dateEntry.grid(row=3, column=1, pady=5, sticky=tk.W) self.dateEntry.bind("<Button-1>", self.clear) # 设定点击事件,点击文本框清除默认值 tk.Label(self.win, text='内容:').grid(row=4, column=0, pady=5, sticky=tk.E) self.txtEntry = tk.Entry(self.win, width=40) self.txtEntry.grid(row=4, column=1, pady=5, sticky=tk.W) self.creatButton = tk.Button(self.win, text='添加', width=10, height=1, command=self.Creat) self.creatButton.grid(row=5, column=0, pady=5) self.selectButton = tk.Button(self.win, text='查询', width=10, height=1, command=self.shouye) self.selectButton.grid(row=5, column=1, pady=5) self.selectButton = tk.Button(self.win, text='词云', width=10, height=1, command=self.ciyun) self.selectButton.grid(row=6, column=0, pady=5) self.selectButton = tk.Button(self.win, text='导出', width=10, height=1, command=self.daochu) self.selectButton.grid(row=6, column=1, pady=5) self.listBox = tk.Listbox(self.win, width=70, height=20) self.listBox.grid(row=7, column=0, columnspan=2, pady=5) self.listBox.bind('<<ListboxSelect>>', self.click) # 绑定点击事件 self.pageEntry = tk.Entry(self.win, width=5) self.pageEntry.grid(row=8, column=0, columnspan=2, pady=5) self.selectButton = tk.Button(self.win, text='上一页', width=10, height=1, command=self.last) self.selectButton.grid(row=9, column=0) self.selectButton = tk.Button(self.win, text='下一页', width=10, height=1, command=self.next) self.selectButton.grid(row=9, column=1) self.win.mainloop() def clear(self, event): if self.dateEntry.get() == 'YYYY-MM-DD': self.dateEntry.delete(0, tk.END) def Creat(self): self.idnum -= 1 self.tablena = self.cmb.get() title = self.titleEntry.get() media = self.mediaEntry.get() date = self.dateEntry.get() txt = self.txtEntry.get() if date == 'YYYY-MM-DD': date = '' else: try: time.strptime(date, "%Y-%m-%d") except: messagebox.showerror(title='错误', message='日期格式不正确') return if title == '' or media == '' or date == '' or txt == '': messagebox.showerror(title='失败', message='添加信息有空值') return self.mS.inSql(self.tablena, self.idnum, title, media, date, txt) messagebox.showinfo(title='成功', message='添加成功') # 查询按钮调用方法,重置页面为首页 def shouye(self): self.page = 0 self.Select() self.pageEntry.delete(0, tk.END) self.pageEntry.insert(0, str(int(self.page / 20 + 1))) # 上一页 def last(self): self.page -= 20 if self.page < 0: self.page = 0 self.Select() self.pageEntry.delete(0, tk.END) self.pageEntry.insert(0, str(int(self.page / 20 + 1))) # 下一页 def next(self): self.page += 20 self.Select() self.pageEntry.delete(0, tk.END) self.pageEntry.insert(0, str(int(self.page / 20 + 1))) def Select(self): self.tablena = self.cmb.get() self.listBox.delete(0, tk.END) title = self.titleEntry.get() media = self.mediaEntry.get() date = self.dateEntry.get() if date == 'YYYY-MM-DD' or date == '': date = '' else: try: time.strptime(date, "%Y-%m-%d") except: messagebox.showerror(title='错误', message='日期格式不正确') return txt = self.txtEntry.get() list_se = self.mS.outSql(self.tablena, title, media, date, txt, self.page) for i in list_se: i = str(i)[2:-3] self.listBox.insert(tk.END, i) def ciyun(self): self.mS.Ciyun(self.tablena) messagebox.showinfo(title='成功', message='词云导出在Ciyun目录中') def daochu(self): self.tablena = self.cmb.get() self.mS.Excel(self.tablena) messagebox.showinfo(title='成功', message='表格导出在Excel目录中') def click(self, event): w = event.widget title = w.get(w.curselection()) # 获取鼠标选中的标题 xx = xiangxi.xiangxi(self.tablena, title) xx.wins() # xiangxi.py import mySql import time import tkinter as tk from tkinter import messagebox class xiangxi: def __init__(self, tablena, title): self.tablena = tablena self.mS = mySql.mySql() self.list_se = self.mS.selSql(tablena, title) def wins(self): self.win = tk.Tk() self.win.title('详细信息') self.win.geometry('650x500+870+150') tk.Label(self.win, text='标题:').grid(row=0, column=0, pady=5, sticky=tk.E) self.titleEntry = tk.Entry(self.win, width=40) self.titleEntry.grid(row=0, column=1, pady=5, sticky=tk.W) self.titleEntry.insert(tk.END, self.list_se[0][1]) tk.Label(self.win, text='媒体:').grid(row=1, column=0, pady=5, sticky=tk.E) self.mediaEntry = tk.Entry(self.win, width=40) self.mediaEntry.grid(row=1, column=1, pady=5, sticky=tk.W) self.mediaEntry.insert(tk.END, self.list_se[0][2]) tk.Label(self.win, text='时间:').grid(row=2, column=0, pady=5, sticky=tk.E) self.dateEntry = tk.Entry(self.win, width=40) self.dateEntry.grid(row=2, column=1, pady=5, sticky=tk.W) self.dateEntry.insert(tk.END, self.list_se[0][3]) tk.Label(self.win, text='内容:').grid(row=3, column=0, pady=5, sticky=tk.E) self.txtText = tk.Text(self.win) self.txtText.grid(row=3, column=1, pady=5, sticky=tk.W) self.txtText.insert(tk.END, self.list_se[0][4]) self.updatetButton = tk.Button(self.win, text='修改', width=10, height=1, command=self.Update) self.updatetButton.grid(row=4, column=0, columnspan=2, pady=5) self.deleteButton = tk.Button(self.win, text='删除', width=10, height=1, command=self.Delete) self.deleteButton.grid(row=4, column=1, columnspan=2, pady=5, sticky=tk.W) self.win.mainloop() def Update(self): title = self.titleEntry.get() media = self.mediaEntry.get() date = self.dateEntry.get() try: time.strptime(date, "%Y-%m-%d") except: messagebox.showerror(title='错误', message='日期格式不正确') return text = self.txtText.get('1.0', tk.END) self.mS.upSql(self.tablena, self.list_se[0][0], title, media, date, text) messagebox.showinfo(title='成功', message='修改成功') def Delete(self): YN = messagebox.askokcancel('警告', '删除后信息无法恢复') # 是/否,返回值yes/no if YN: self.mS.delSql(self.tablena, self.list_se[0][0]) messagebox.showinfo(title='成功', message='删除成功') # zhuce.py import mySql from tkinter import * from tkinter import messagebox class zhuce: def wins(self): self.registered = Tk() self.registered.title('registered') self.registered.geometry('230x150+100+380') Label(self.registered, text='用户注册').grid(row=0, column=0, columnspan=2) Label(self.registered, text='用户名:').grid(row=1, column=0, sticky=E) self.names = Entry(self.registered) self.names.grid(row=1, column=1) Label(self.registered, text='密码:').grid(row=2, column=0, sticky=E) self.passwds = Entry(self.registered, show='*') self.passwds.grid(row=2, column=1) Label(self.registered, text='确认密码:').grid(row=3, column=0) self.repasswd = Entry(self.registered, show='*') self.repasswd.grid(row=3, column=1) Button(self.registered, text='注册', command=self.registeredes).grid(row=4, column=0, columnspan=2, pady=5) self.registered.mainloop() def registeredes(self): mS = mySql.mySql() name = self.names.get() passwd = self.passwds.get() re = mS.useSql(name) if passwd != self.repasswd.get(): messagebox.showerror(title='注册失败', message='两次密码不一致') elif name == '': messagebox.showerror(title='注册失败', message='用户名为空') elif passwd == '': messagebox.showerror(title='注册失败', message='密码为空') elif re: messagebox.showerror(title='注册失败', message='用户名已存在') else: mS.crUse(name, passwd) messagebox.showinfo(title='成功', message='注册成功') self.registered.destroy()请注意源码中的绝对路径,请做相应的修改