在网络爬虫中,经常需要设置一些请求头信息。设置头信息的作用是伪装网络爬虫,使网络爬虫发出的请求更像浏览器访问网页(当然也可以通过Java的Selenium框架来实现),进而降低网络爬虫被网站封锁的风险。HttpClient工具提供了两种设置头信息的方法:一种是在请求对象上逐项调用setHeader方法,如程序3-15所示;另一种是先用集合封装头信息,再通过setDefaultHeaders方法将其设置为HttpClient的默认头信息,如程序3-16所示。
//程序3-15 public class HttpClientSetHeader { public static void main(String[] args) throws Exception { //初始化httpclient HttpClient httpClient = HttpClients.custom().build(); //使用的请求方法 HttpGet httpget = new HttpGet("https://searchcustomerexperience.techtarget.com/info/news"); //请求头配置 httpget.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); httpget.setHeader("Accept-Encoding", "gzip, deflate"); httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.9"); httpget.setHeader("Cache-Control", "max-age=0"); httpget.setHeader("Host", "searchcustomerexperience.techtarget.com"); httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"); //这项内容很重要 //发出get请求 HttpResponse response = httpClient.execute(httpget); //获取响应状态码 int code = response.getStatusLine().getStatusCode(); //获取网页内容流 HttpEntity httpEntity = response.getEntity(); //以字符串的形式(需设置编码) String entity = EntityUtils.toString(httpEntity, "gbk"); //输出所获得的的内容 System.out.println(code + "\n" + entity); //关闭内容流 EntityUtils.consume(httpEntity); } } //程序3-16 public class HttpclientSetHeader { public static void main(String[] args) throws Exception { //通过集合封装头信息 List<Header> headerList = new ArrayList<Header>(); headerList.add(new BasicHeader(HttpHeaders.ACCEPT, "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")); headerList.add(new BasicHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36")); headerList.add(new BasicHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate")); headerList.add(new BasicHeader(HttpHeaders.CACHE_CONTROL, "max-age=0")); headerList.add(new BasicHeader(HttpHeaders.CONNECTION, "keep-alive")); headerList.add(new BasicHeader(HttpHeaders.ACCEPT_LANGUAGE, "zh-CN,zh;q=0.9")); headerList.add(new BasicHeader(HttpHeaders.HOST, 
"searchcustomerexperience.techtarget.com")); //构造自定义的HttpClient对象 HttpClient httpClient = HttpClients.custom().setDefaultHeaders(headerList).build(); //使用的请求方法 HttpGet httpget = new HttpGet("https://searchcustomerexperience.techtarget.com/info/news"); //发出get请求并获取结果 HttpResponse response = httpClient.execute(httpget); //获取响应状态码 int code = response.getStatusLine().getStatusCode(); //获取网页内容流 HttpEntity httpEntity = response.getEntity(); //以字符串的形式(需设置编码) String entity = EntityUtils.toString(httpEntity, "gbk"); //输出所获得的的内容 System.out.println(code + "\n" + entity); //关闭内容流 EntityUtils.consume(httpEntity); } }