[Node.js] 模块化 -- 爬虫

技术2024-04-12 84

使用第三方模块的步骤：1. 新建一个文件夹，文件夹名不要用中文，也不要和模块名相同。2. 进入该文件夹，运行命令 npm init -y（可以理解为初始化项目）。3. 下载模块：去 npm 官网搜索该模块，按照它的说明进行安装。4. 使用模块：参考模块官网或模块说明中的用法。

爬取文本内容

下载环节

创建一个文件夹（这里以 npmUse 文件夹为例）；打开 cmd，进入该文件夹，输入命令 npm init -y；然后在 npm 官网上找到指定的模块，按其说明下载。

使用环节

// Code listing: crawl the text content of a page and save it to a file.
const Crawler = require("crawler");
const fs = require('fs');

// Directory and file that receive the crawled text.
const OUTPUT_DIR = './temp';
const OUTPUT_FILE = './temp/1.txt';

const c = new Crawler({
    maxConnections: 10,
    /**
     * Called once for each crawled page.
     *
     * @param {Error|null} error - Request error, if any.
     * @param {object} res - Crawler response; `res.$` is used as a Cheerio handle.
     * @param {Function} done - Must be called when this callback has finished.
     */
    callback: function (error, res, done) {
        if (error) {
            console.log(error);
            done();
            return;
        }

        // $ is Cheerio by default: a lean implementation of core jQuery
        // designed specifically for the server.
        // For example, $("title").text() yields the page title.
        const $ = res.$;
        const bodyText = $("body").text();

        // Make sure the target directory exists first; otherwise writeFile
        // fails with ENOENT on a fresh checkout.
        fs.mkdir(OUTPUT_DIR, { recursive: true }, (mkdirErr) => {
            if (mkdirErr) {
                console.log(mkdirErr);
                done();
                return;
            }

            // Save the <body> text of the crawled page to a file.
            fs.writeFile(OUTPUT_FILE, bodyText, (err) => {
                if (err == null) {
                    console.log('爬取并保持成功!');
                } else {
                    // Report the failure instead of dropping it silently.
                    console.log(err);
                }
                // Signal completion only after the write has finished.
                done();
            });
        });
    }
});

// DXY (Ding Xiang Yuan) real-time COVID-19 data page.
c.queue('https://ncov.dxy.cn/ncovh5/view/pneumonia');

爬取文件和视频

下载环节

创建一个文件夹（这里以 npmUse 文件夹为例）；打开 cmd，进入该文件夹，输入命令 npm init -y；然后在 npm 官网上找到指定的模块，按其说明下载。

使用环节

// Code listing: download binary resources (an image and a video) to disk.
const Crawler = require("crawler");
const fs = require('fs');

const c = new Crawler({
    // NOTE(review): per the crawler README, `encoding: null` keeps res.body
    // as raw bytes instead of decoding it to a string - confirm for the
    // installed crawler version.
    encoding: null,
    jQuery: false, // set false to suppress warning message.
    /**
     * Writes the downloaded body to the file named in the queued options.
     *
     * @param {Error|null} err - Request error, if any.
     * @param {object} res - Crawler response; `res.options.filename` is the
     *   target path supplied in the corresponding `queue()` call.
     * @param {Function} done - Must be called when this callback has finished.
     */
    callback: function (err, res, done) {
        if (err) {
            console.error(err.stack);
            done();
            return;
        }

        // writeFile opens, writes and closes the file in one call, so the
        // descriptor is not left open and write errors are reported.
        fs.writeFile(res.options.filename, res.body, (writeErr) => {
            if (writeErr) {
                console.error(writeErr.stack);
            }
            done();
        });
    }
});

// Download a single image from a website.
// NOTE(review): the source is a .jpg but it is saved with a .png extension.
c.queue({
    uri: "http://pic1.sc.chinaz.com/files/pic/pic9/202005/apic25534.jpg",
    filename: "./picture.png"
});

// Download a bilibili video (replace the placeholder with the real video URL).
c.queue({
    uri: "//视频地址",
    filename: "./video.MP4",
    // Make the server-side request look like a client. The header name must
    // be exactly "User-Agent": a name containing a space is not a valid HTTP
    // header token.
    headers: { 'User-Agent': 'requests' }
});

注意：爬取视频时，有时会碰到反爬，所以要做一些小小的伪装，即设置请求头 headers: {'User-Agent': 'requests'}（请求头名称 User-Agent 中不能含有空格）。

爬取王者荣耀内容

// Capture: use the crawler package to scrape data from web pages.
// Store: use the mysql-ithm package to save the scraped data into a database.

// Import the packages.
const Crawler = require("crawler");
const hm = require('mysql-ithm');

// ---------------------------------------------------------------------------
// Storage setup. This is declared first so the model exists before any
// crawler callback can reference it.
// ---------------------------------------------------------------------------

// Connect to the database.
// If the database exists it is connected to; otherwise it is created automatically.
hm.connect({
    host: 'localhost', // database host
    port: '3306',
    user: 'root', // user name; may be omitted if there is none
    password: '123', // password; may be omitted if there is none
    database: 'cqmanager503' // database name
});

// Create the Model (table model: responsible for create/read/update/delete).
// If the table exists it is connected to; otherwise it is created automatically.
// NOTE(review): isDelete is declared as String here but the crawler below
// stores the boolean `false` - confirm how mysql-ithm maps that value.
const heroModel = hm.model('hero', {
    heroName: String,
    heroSkill: String,
    heroIcon: String,
    isDelete: String
});

// Collects every hero scraped from the detail pages.
const heros = [];

// ---------------------------------------------------------------------------
// Detail crawler: scrapes one hero detail page per request.
// ---------------------------------------------------------------------------
const xq = new Crawler({
    maxConnections: 10,
    /**
     * Called for each crawled detail page; extracts the hero's name, first
     * skill name and icon URL and appends them to `heros`.
     *
     * @param {Error|null} error - Request error, if any.
     * @param {object} res - Crawler response; `res.$` is used as a Cheerio handle.
     * @param {Function} done - Must be called when this callback has finished.
     */
    callback: function (error, res, done) {
        if (error) {
            console.log(error);
        } else {
            const $ = res.$;
            // Hero name, hero skill, hero icon.
            heros.push({
                heroName: $(".cover-name").text(),
                heroSkill: $(".skill-name>b").first().text(),
                // The src read from the page is prefixed with the scheme here.
                heroIcon: 'https:' + $(".ico-play").prev('img').attr('src'),
                isDelete: false
            });
        }
        done();
    }
});

// Only store the data after all detail requests have completed.
xq.on('drain', function () {
    if (heros.length === 0) {
        // Nothing was scraped, so there is nothing to insert.
        console.log('no heroes scraped; skipping insert');
        return;
    }
    heroModel.insert(heros, (err, results) => {
        console.log(err);
        console.log(results);
        if (!err) console.log('增加成功');
    });
});

// ---------------------------------------------------------------------------
// List crawler: fetches the hero list and queues one detail request per hero.
// ---------------------------------------------------------------------------
const c = new Crawler({
    maxConnections: 10,
    /**
     * Called for the hero list response. The body is parsed as JSON and
     * iterated as an array of hero objects; each hero's `ename` is used to
     * build the detail page URL, which is queued on the detail crawler.
     *
     * @param {Error|null} error - Request error, if any.
     * @param {object} res - Crawler response; `res.body` holds the JSON text.
     * @param {Function} done - Must be called when this callback has finished.
     */
    callback: function (error, res, done) {
        if (error) {
            console.log(error);
            done();
            return;
        }

        let heroList;
        try {
            heroList = JSON.parse(res.body);
        } catch (parseError) {
            // A non-JSON response must not escape the callback; otherwise
            // done() would never be called for this request.
            console.log(parseError);
            done();
            return;
        }

        // Every hero needs its avatar and skill, so build a detail page URL
        // from each hero's ename and send a follow-up request.
        heroList.forEach((v) => {
            xq.queue(`https://pvp.qq.com/web201605/herodetail/${v.ename}.shtml`);
        });
        done();
    }
});

// Send the request: queue just one URL, with the default callback.
c.queue('https://pvp.qq.com/web201605/js/herolist.json');

Processed: 0.030, SQL: 9