网络爬虫在采集少量数据时,可以使用Excel进行存储。 Java中主要有两款操作Excel的工具JXL(主要处理xls格式的Excel)和POI(可处理xls、xlsx、word等格式的文件)。本小节利用POI封装了Excel读取和写入方法,供读者参考。 使用Excel存储数据之前,需要在Maven工程的pom.xml文件中添加Excel的dependency。
<dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.15</version> </dependency>如程序5-2所示,将采集到的URL存入excel表格。
//程序5-2 public class ExcelStorage { public static void main(String[] args) throws IOException { //需要写入到excel的内容 //这里选择使用map数据结构来存储excel所对应的一行数据 List<Map<String,String>> excelMapList = new LinkedList<>(); //通过Jsoup创建和url的连接 Connection connection = Jsoup.connect("https://searchcustomerexperience.techtarget.com/info/news"); //获取网页的Document对象 Document document = connection.get(); Elements aList = document.getElementsByTag("a"); if(aList.size()>0){ for(int i = 0;i<aList.size();i++){ Element a = aList.get(i); if(a != null){ String url = a.attr("href"); if(url.contains("http")) { Map<String,String> map = new HashMap<>(); map.put("f1",url); excelMapList.add(map); } } } } //设置excel表头 List<String> headList = new ArrayList<>(); headList.add("url"); //创建含有表头的excel XSSFWorkbook workbook = ExcelUtils.getWorkbook("Sheet1", headList); //将excelMapList内容写入Excel String excelPath = "/Users/steven/Documents/代码/project/spider/src/main/java/com/topicBet/url.xlsx"; ExcelUtils.writeExcelContent(workbook, "Sheet1", excelPath, excelMapList); } } @Slf4j public class ExcelUtils { /** * 读取excel文件内容,文件需要含有表头,如果没有表头请以f1,f2,f3......填充 * * @param filePath 文件路径 * @param sheetAt 表格 * @return List<Map<String,String>> */ public static List<Map<String, String>> readExcelContent(String filePath, Integer sheetAt) { List<Map<String, String>> mapList = new ArrayList<>(); FileInputStream fileInputStream = null; try { fileInputStream = new FileInputStream(filePath); // 延迟解析比率 ZipSecureFile.setMinInflateRatio(-1.0d); XSSFWorkbook workbook = new XSSFWorkbook(fileInputStream); XSSFSheet sheet = workbook.getSheetAt(sheetAt); int rows = sheet.getPhysicalNumberOfRows(); for (int i = 1; i <= rows; i++) { //此行不为空行 if (sheet.getRow(i) != null) { Map<String, String> map = new HashMap<>(); int columns = sheet.getRow(i).getPhysicalNumberOfCells(); for (int j = 0; j < columns; j++) { map.put("f" + (j + 1), getString(sheet.getRow(i).getCell(j)).trim()); } mapList.add(map); } } } catch (Exception e) { log.error("读取文件出现异常"); } return mapList; } /** * 把单元格的内容转为字符串 * * @param xssfCell 单元格 * @return String */ public static String getString(XSSFCell xssfCell) { if (xssfCell == null) { return ""; } if (xssfCell.getCellTypeEnum() == CellType.NUMERIC) { DecimalFormat df = new DecimalFormat("0"); return df.format(xssfCell.getNumericCellValue()); } else if (xssfCell.getCellTypeEnum() == CellType.BOOLEAN) { return String.valueOf(xssfCell.getBooleanCellValue()); } else { return xssfCell.getStringCellValue(); } } /** * 创建含有表头的excel * * @param sheetName 表格名称 * @param headList 表头字符串列表 * @return XSSFWorkbook */ public static XSSFWorkbook getWorkbook(String sheetName, List<String> headList) { XSSFWorkbook workbook = new XSSFWorkbook(); XSSFSheet sheet = workbook.createSheet(sheetName); XSSFRow titleRow = sheet.createRow(0); for (int i = 0; i < headList.size(); i++) { XSSFCell cell = titleRow.createCell(i); cell.setCellValue(headList.get(i)); } return workbook; } /** * 将需要输出的内容填充到excel并保存到本地 * * @param workbook 表格 * @param sheetName 表格名称 * @param path 输出到本地的路径 * @param excelMapList 需要输出的内容 */ public static void writeExcelContent(XSSFWorkbook workbook, String sheetName, String path, List<Map<String, String>> excelMapList) { // 延迟解析比率 ZipSecureFile.setMinInflateRatio(-1.0d); XSSFSheet sheet = workbook.getSheet(sheetName); for (int i = 1; i <= excelMapList.size(); i++) { XSSFRow row = sheet.createRow(i); Map<String, String> map = excelMapList.get(i - 1); for (int j = 0; j < map.size(); j++) { XSSFCell cell = row.createCell(j); cell.setCellValue(map.get("f" + (j + 1))); } } //用输出流写到excel FileOutputStream outputStream = null; try { outputStream = new FileOutputStream(path); workbook.write(outputStream); outputStream.flush(); outputStream.close(); } catch (Exception e) { log.error("写入excel发生异常"); } } }