java大数据量导入 Java实现excel大数据量导入

软件发布|下载排行|最新软件

当前位置:首页IT学院IT技术

java大数据量导入 Java实现excel大数据量导入

王绍桦   2021-03-16 我要评论
想了解Java实现excel大数据量导入的相关内容吗,王绍桦在本文为您仔细讲解java大数据量导入的相关知识和一些Code实例,欢迎阅读和指正,我们先划重点:java,excel大数据量导入,java大数据量导入,java,excel数据导入,下面大家一起来学习吧。

情景分析:

通常我们通过poi读取excel文件时,若在用户模式下,由于数据量较大、Sheet较多,很容易出现内存溢出的情况

用户模式读取excel的典型代码如下:

FileInputStream file = new FileInputStream("c:\\test.xlsx");
Workbook wb=new XSSFWorkbook(file);

而03版(xls)excel文件每个sheet最大长度为 65536行,07版(xlsx)excel文件每个sheet最大长度为 1048576行,因此我们主要解决07版(xlsx)excel大数据量导入出现内存溢出的问题

由于读取cvs文件效率高且占用内存少,所以我们采用xlsx转换为cvs的方式来解决

读取xlsx格式数据:

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
 
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
 
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
 
public class XLSX2CSV {
 
 enum xssfDataType {
 BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER,
 }
 
 /**
 * 使用xssf_sax_API处理Excel,请参考: http://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api
 */
 class MyXSSFSheetHandler extends DefaultHandler {
 
 private StylesTable stylesTable;
 
 private ReadOnlySharedStringsTable sharedStringsTable;
 
 private final PrintStream output;
 
 private final int minColumnCount;
 
 private boolean vIsOpen;
 
 private xssfDataType nextDataType;
 
 private short formatIndex;
 private String formatString;
 private final DataFormatter formatter;
 
 private int thisColumn = -1;
 private int lastColumnNumber = -1;
 
 private StringBuffer value;
 private String[] record;
 private List<String[]> rows = new ArrayList<String[]>();
 private boolean isCellNull = false;
 
 public MyXSSFSheetHandler(StylesTable styles,ReadOnlySharedStringsTable strings, int cols, PrintStream target) {
 this.stylesTable = styles;
 this.sharedStringsTable = strings;
 this.minColumnCount = cols;
 this.output = target;
 this.value = new StringBuffer();
 this.nextDataType = xssfDataType.NUMBER;
 this.formatter = new DataFormatter();
 record = new String[this.minColumnCount];
 rows.clear();// 每次读取都清空行集合
 }
 
 public void startElement(String uri, String localName, String name,Attributes attributes) throws SAXException {
 if ("inlineStr".equals(name) || "v".equals(name)) {
 vIsOpen = true;
 // Clear contents cache
 value.setLength(0);
 }
 // c => cell
 else if ("c".equals(name)) {
 // Get the cell reference
 String r = attributes.getValue("r");
 int firstDigit = -1;
 for (int c = 0; c < r.length(); ++c) {
  if (Character.isDigit(r.charAt(c))) {
  firstDigit = c;
  break;
  }
 }
 thisColumn = nameToColumn(r.substring(0, firstDigit));
 
 // Set up defaults.
 this.nextDataType = xssfDataType.NUMBER;
 this.formatIndex = -1;
 this.formatString = null;
 String cellType = attributes.getValue("t");
 String cellStyleStr = attributes.getValue("s");
 if ("b".equals(cellType))
  nextDataType = xssfDataType.BOOL;
 else if ("e".equals(cellType))
  nextDataType = xssfDataType.ERROR;
 else if ("inlineStr".equals(cellType))
  nextDataType = xssfDataType.INLINESTR;
 else if ("s".equals(cellType))
  nextDataType = xssfDataType.SSTINDEX;
 else if ("str".equals(cellType))
  nextDataType = xssfDataType.FORMULA;
 else if (cellStyleStr != null) {
  // It's a number, but almost certainly one
  // with a special style or format
  int styleIndex = Integer.parseInt(cellStyleStr);
  XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
  this.formatIndex = style.getDataFormat();
  this.formatString = style.getDataFormatString();
  if (this.formatString == null)
  this.formatString = BuiltinFormats
  .getBuiltinFormat(this.formatIndex);
 }
 }
 
 }
 
 public void endElement(String uri, String localName, String name)throws SAXException {
 
 String thisStr = null;
 
 // v => contents of a cell
 if ("v".equals(name)) {
 // Process the value contents as required.
 // Do now, as characters() may be called more than once
 switch (nextDataType) {
 
 case BOOL:
  char first = value.charAt(0);
  thisStr = first == '0' ? "FALSE" : "TRUE";
  break;
 
 case ERROR:
  thisStr = "\"ERROR:" + value.toString() + '"';
  break;
 
 case FORMULA:
  // A formula could result in a string value,
  // so always add double-quote characters.
  thisStr = '"' + value.toString() + '"';
  break;
 
 case INLINESTR:
  // TODO: have seen an example of this, so it's untested.
  XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
  thisStr = '"' + rtsi.toString() + '"';
  break;
 
 case SSTINDEX:
  String sstIndex = value.toString();
  try {
  int idx = Integer.parseInt(sstIndex);
  XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx));
  thisStr = rtss.toString();
  } catch (NumberFormatException ex) {
  output.println("Failed to parse SST index '" + sstIndex
  + "': " + ex.toString());
  }
  break;
 
 case NUMBER:
  String n = value.toString();
  // 判断是否是日期格式
  if (HSSFDateUtil.isADateFormat(this.formatIndex, n)) {
  Double d = Double.parseDouble(n);
  Date date=HSSFDateUtil.getJavaDate(d);
  thisStr=formateDateToString(date);
  } else if (this.formatString != null)
  thisStr = formatter.formatRawCellContents(
  Double.parseDouble(n), this.formatIndex,
  this.formatString);
  else
  thisStr = n;
  break;
 
 default:
  thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
  break;
 }
 
 // Output after we've seen the string contents
 // Emit commas for any fields that were missing on this row
 if (lastColumnNumber == -1) {
  lastColumnNumber = 0;
 }
 //判断单元格的值是否为空
 if (thisStr == null || "".equals(isCellNull)) {
  isCellNull = true;// 设置单元格是否为空值
 }
 record[thisColumn] = thisStr;
 // Update column
 if (thisColumn > -1)
  lastColumnNumber = thisColumn;
 
 } else if ("row".equals(name)) {
 // Print out any missing commas if needed
 if (minColumns > 0) {
  // Columns are 0 based
  if (lastColumnNumber == -1) {
  lastColumnNumber = 0;
  }
  if (isCellNull == false && record[0] != null
  && record[1] != null)// 判断是否空行
  {
  rows.add(record.clone());
  isCellNull = false;
  for (int i = 0; i < record.length; i++) {
  record[i] = null;
  }
  }
 }
 lastColumnNumber = -1;
 }
 
 }
 
 public List<String[]> getRows() {
 return rows;
 }
 
 public void setRows(List<String[]> rows) {
 this.rows = rows;
 }
 
 /**
 * Captures characters only if a suitable element is open. Originally
 * was just "v"; extended for inlineStr also.
 */
 public void characters(char[] ch, int start, int length)throws SAXException {
 if (vIsOpen)
 value.append(ch, start, length);
 }
 
 /**
 * Converts an Excel column name like "C" to a zero-based index.
 * @param name
 * @return Index corresponding to the specified name
 */
 private int nameToColumn(String name) {
 int column = -1;
 for (int i = 0; i < name.length(); ++i) {
 int c = name.charAt(i);
 column = (column + 1) * 26 + c - 'A';
 }
 return column;
 }
 
 private String formateDateToString(Date date) {
 SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");//格式化日期
 return sdf.format(date);
 
 }
 
 }
 
 private OPCPackage xlsxPackage;
 private int minColumns;
 private PrintStream output;
 
 /**
 * Creates a new XLSX -> CSV converter
 * 
 * @param pkg
 *   The XLSX package to process
 * @param output
 *   The PrintStream to output the CSV to
 * @param minColumns
 *   The minimum number of columns to output, or -1 for no minimum
 */
 public XLSX2CSV(OPCPackage pkg, PrintStream output,int minColumns) {
 this.xlsxPackage = pkg;
 this.output = output;
 this.minColumns = minColumns;
 }
 
 /**
 * Parses and shows the content of one sheet using the specified styles and
 * shared-strings tables.
 * @param styles
 * @param strings
 * @param sheetInputStream
 */
 public List<String[]> processSheet(StylesTable styles,
 ReadOnlySharedStringsTable strings, InputStream sheetInputStream)
 throws IOException, ParserConfigurationException, SAXException {
 InputSource sheetSource = new InputSource(sheetInputStream);
 SAXParserFactory saxFactory = SAXParserFactory.newInstance();
 SAXParser saxParser = saxFactory.newSAXParser();
 XMLReader sheetParser = saxParser.getXMLReader();
 MyXSSFSheetHandler handler = new MyXSSFSheetHandler(styles, strings,this.minColumns, this.output);
 sheetParser.setContentHandler(handler);
 sheetParser.parse(sheetSource);
 return handler.getRows();
 }
 
 /**
 * 初始化这个处理程序 
 * @throws IOException
 * @throws OpenXML4JException
 * @throws ParserConfigurationException
 * @throws SAXException
 */
 public List<String[]> process() throws IOException, OpenXML4JException,ParserConfigurationException, SAXException {
 ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(this.xlsxPackage);
 XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);
 List<String[]> list = null;
 StylesTable styles = xssfReader.getStylesTable();
 XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
 int index = 0;
 while (iter.hasNext()) {
 InputStream stream = iter.next();
 index++;
 //默认取第一个工作薄
 if(index == 1){
 list = processSheet(styles, strings, stream);
 stream.close();
 }
 }
 return list;
 }
 
 /**
 * 读取Excel
 * 
 * @param path
 *   文件路径
 * @param sheetName
 *   sheet名称
 * @param minColumns
 *   列总数
 * @return
 * @throws SAXException
 * @throws ParserConfigurationException
 * @throws OpenXML4JException
 * @throws IOException
 */
 private static List<String[]> readerExcel(String path,int minColumns) throws IOException, OpenXML4JException,ParserConfigurationException, SAXException {
 OPCPackage p = OPCPackage.open(path, PackageAccess.READ);
 XLSX2CSV xlsx2csv = new XLSX2CSV(p, System.out, minColumns);
 List<String[]> list = xlsx2csv.process();
 p.close();
 return list;
 }
 
 /**
 * 读取Excel
 * 
 * @param file
 *   File
 * @param sheetName
 *   sheet名称
 * @param minColumns
 *   列总数
 * @return
 * @throws SAXException
 * @throws ParserConfigurationException
 * @throws OpenXML4JException
 * @throws IOException
 */
 private static List<String[]> readerExcelInputStream(File file,int minColumns) throws IOException, OpenXML4JException, ParserConfigurationException, SAXException {
 OPCPackage p = OPCPackage.openOrCreate(file);
 XLSX2CSV xlsx2csv = new XLSX2CSV(p, System.out, minColumns);
 List<String[]> list = xlsx2csv.process();
 p.close();
 return list;
 }
 
 //获取表头及数据内容,可在业务层调用
 public static Map<String,Object> getTableDate(InputStream in) throws IOException, OpenXML4JException, ParserConfigurationException, SAXException{ 
 File out = new File("C:\\date.xlsx");//默认在C盘生成date.xlsx文件
 OutputStream output = null;
 try {
 output = new FileOutputStream(out);
 byte[] buf = new byte[1024];
 int bytesRead;
 while ((bytesRead = in.read(buf)) > 0) {
 output.write(buf, 0, bytesRead);
 }
 } finally {
 in.close();
 output.close();
 }
 Map<String,Object> map = new HashMap<String,Object>();
 List<String> headresult = new ArrayList<String>();//表头集合
 List<List<String>> dataresult = new ArrayList<List<String>>();//表数据集合
 List<String[]> list = readerExcelInputStream(out,30);//默认读取30列
 for(int i=0;i<list.size();i++){
 if(i == 0){
 for(String cell : list.get(i)){
  if(cell != null){  
  headresult.add(cell);
  }
 }
 }else{
 List<String> dataList = new ArrayList<String>();
 for(String cell : list.get(i)){
  dataList.add(cell == null ? "" : cell);
 }
 dataresult.add(dataList);
 }
 }
 map.put("headresult", headresult);
 map.put("dataresult", dataresult);
 return map;
 }
 
 public static void main(String[] args) throws Exception {
 File file = new File("C:/Users/Administrator/Desktop/测试.xlsx");
 System.out.println("开始读取...");
 List<String[]> list = XLSX2CSV.readerExcelInputStream(file, 30);//默认读取30列
 System.out.println("数据量:"+list.size());
 for(int i=0;i<list.size();i++){
 if(i > 0){
 for (String cell : list.get(i)) {
  System.out.print(cell + ",");
 }
 System.out.println();
 }
 }
 }
 
}

说明:

代码中的加载excel的核心方法:

OPCPackage pkg = OPCPackage.open(path);//文件路径
OPCPackage pkg = OPCPackage.openOrCreate(file);//文件
OPCPackage pkg = OPCPackage.open(InputStream);//文件流

文件流方式对内存依赖极大,所以实际应用时,如果只能获取文件流的话,可以先将文件通过流拷贝到本地再解析代码中的

若上传文件大小超过限制,可在配置文件中设置,SpringBoot2项目application.properties中设置:

spring.servlet.multipart.maxFileSize=50MB
spring.servlet.multipart.maxRequestSize=100MB

实际业务数据入库的话也可采用异步任务@Async的方式来提高入库效率:

import java.util.Map;
 
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
 
import cn.com.app.dao.JkconfigDao;
 
@Service
public class JkrzService {
 
 @Autowired
 private JkconfigDao jkconfigDao;
 
 @Async("myTaskAsyncPool")
 public void transJkrz(Map<String, Object> m) {
 jkconfigDao.insert(m);
 }
}

线程池配置:

import java.util.concurrent.Executor;
import java.util.concurrent.ThreadPoolExecutor;
 
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
 
@Configuration
@EnableAsync
public class AsyncTaskConfig {
 
 @Autowired 
 private TaskThreadPoolConfig config;
 
 // ThredPoolTaskExcutor的处理流程
 // 当池子大小小于corePoolSize,就新建线程,并处理请求
 // 当池子大小等于corePoolSize,把请求放入workQueue中,池子里的空闲线程就去workQueue中取任务并处理
 // 当workQueue放不下任务时,就新建线程入池,并处理请求,如果池子大小撑到了maximumPoolSize,就用RejectedExecutionHandler来做拒绝处理
 // 当池子的线程数大于corePoolSize时,多余的线程会等待keepAliveTime长时间,如果无请求可处理就自行销毁
 // 当threadNamePrefix设置为true,则核心线程也会超时关闭 
 @Bean
 public Executor myTaskAsyncPool() {
  ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); 
   executor.setCorePoolSize(config.getCorePoolSize()); 
   executor.setMaxPoolSize(config.getMaxPoolSize()); 
   executor.setQueueCapacity(config.getQueueCapacity()); 
   executor.setKeepAliveSeconds(config.getKeepAliveSeconds()); 
   executor.setAllowCoreThreadTimeOut(true);
   executor.setThreadNamePrefix("MyExecutor-"); 
 
   // rejection-policy:当pool已经达到max size的时候,如何处理新任务 
   // CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行 
   executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy()); 
   executor.initialize(); 
  return executor;
 }
 
}

参数配置:

#核心线程数,当线程数小于核心线程数时,即使有线程空闲,线程池也会优先创建新线程,设置allowCoreThreadTimeout=true(默认false)时,核心线程会超时退出
spring.task.pool.corePoolSize=20
#最大线程数,当线程数大于等于corePoolSize,且任务队列已满时,线程池会创建新线程来处理任务
spring.task.pool.maxPoolSize=60
#线程空闲时间,当线程空闲时间达到keepAliveSeconds(秒)时,线程会退出,直到线程数量等于corePoolSize,如果allowCoreThreadTimeout=true,则会直到线程数量等于0
spring.task.pool.keepAliveSeconds=1
#任务队列容量,当核心线程数达到最大时,新任务会放在队列中排队等待执行
spring.task.pool.queueCapacity=400

读取xls格式数据:

import org.apache.poi.hssf.eventusermodel.*;
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.*;
import org.apache.poi.hssf.usermodel.HSSFDataFormatter;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
 
/**
 * @desc 用于解决.xls2003版本大数据量问题
 **/
public class ExcelXlsReader implements HSSFListener {
 
 //存储所有数据
 private List<List<String>> dataList = new ArrayList<List<String>>();
 
 private int minColums = -1;
 
 private POIFSFileSystem fs;
 
 /**
  * 总行数
  */
 private int totalRows=0;
 
 /**
  * 上一行row的序号
  */
 private int lastRowNumber;
 
 /**
  * 上一单元格的序号
  */
 private int lastColumnNumber;
 
 /**
  * 是否输出formula,还是它对应的值
  */
 private boolean outputFormulaValues = true;
 
 /**
  * 用于转换formulas
  */
 private EventWorkbookBuilder.SheetRecordCollectingListener workbookBuildingListener;
 
 //excel2003工作簿
 private HSSFWorkbook stubWorkbook;
 
 private SSTRecord sstRecord;
 
 private FormatTrackingHSSFListener formatListener;
 
 private final HSSFDataFormatter formatter = new HSSFDataFormatter();
 
 /**
  * 文件的绝对路径
  */
 private String filePath = "";
 
 //表索引
 private int sheetIndex = 0;
 
 private BoundSheetRecord[] orderedBSRs;
 
 @SuppressWarnings("unchecked")
 private ArrayList boundSheetRecords = new ArrayList();
 
 private int nextRow;
 
 private int nextColumn;
 
 private boolean outputNextStringRecord;
 
 //当前行
 private int curRow = 0;
 
 //存储一行记录所有单元格的容器
 private List<String> cellList = new ArrayList<String>();
 
 /**
  * 判断整行是否为空行的标记
  */
 private boolean flag = false;
 
 @SuppressWarnings("unused")
 private String sheetName;
 
 /**
  * 遍历excel下所有的sheet
  *
  * @param fileName
  * @throws Exception
  */
 public int process(String fileName) throws Exception {
  filePath = fileName;
  this.fs = new POIFSFileSystem(new FileInputStream(fileName));
  MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
  formatListener = new FormatTrackingHSSFListener(listener);
  HSSFEventFactory factory = new HSSFEventFactory();
  HSSFRequest request = new HSSFRequest();
  if (outputFormulaValues) {
   request.addListenerForAllRecords(formatListener);
  } else {
   workbookBuildingListener = new EventWorkbookBuilder.SheetRecordCollectingListener(formatListener);
   request.addListenerForAllRecords(workbookBuildingListener);
  }
  factory.processWorkbookEvents(request, fs);
 
  return totalRows; //返回该excel文件的总行数,不包括首列和空行
 }
 
 public List<List<String>> process(InputStream in) throws Exception {
  //filePath = fileName;
  this.fs = new POIFSFileSystem(in);
  MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
  formatListener = new FormatTrackingHSSFListener(listener);
  HSSFEventFactory factory = new HSSFEventFactory();
  HSSFRequest request = new HSSFRequest();
  if (outputFormulaValues) {
   request.addListenerForAllRecords(formatListener);
  } else {
   workbookBuildingListener = new EventWorkbookBuilder.SheetRecordCollectingListener(formatListener);
   request.addListenerForAllRecords(workbookBuildingListener);
  }
  factory.processWorkbookEvents(request, fs);
 
  return dataList; //返回该excel文件的总行数,不包括首列和空行
 }
 
 /**
  * HSSFListener 监听方法,处理Record
  * 处理每个单元格
  * @param record
  */
 @SuppressWarnings("unchecked")
 public void processRecord(Record record) {
  int thisRow = -1;
  int thisColumn = -1;
  String thisStr = null;
  String value = null;
  switch (record.getSid()) {
   case BoundSheetRecord.sid:
    boundSheetRecords.add(record);
    break;
   case BOFRecord.sid: //开始处理每个sheet
    BOFRecord br = (BOFRecord) record;
    if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
     //如果有需要,则建立子工作簿
     if (workbookBuildingListener != null && stubWorkbook == null) {
      stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
     }
 
     if (orderedBSRs == null) {
      orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);
     }
     sheetName = orderedBSRs[sheetIndex].getSheetname();
     sheetIndex++;
    }
    break;
   case SSTRecord.sid:
    sstRecord = (SSTRecord) record;
    break;
   case BlankRecord.sid: //单元格为空白
    BlankRecord brec = (BlankRecord) record;
    thisRow = brec.getRow();
    thisColumn = brec.getColumn();
    thisStr = "";
    cellList.add(thisColumn, thisStr);
    break;
   case BoolErrRecord.sid: //单元格为布尔类型
    BoolErrRecord berec = (BoolErrRecord) record;
    thisRow = berec.getRow();
    thisColumn = berec.getColumn();
    thisStr = berec.getBooleanValue() + "";
    cellList.add(thisColumn, thisStr);
    checkRowIsNull(thisStr); //如果里面某个单元格含有值,则标识该行不为空行
    break;
   case FormulaRecord.sid://单元格为公式类型
    FormulaRecord frec = (FormulaRecord) record;
    thisRow = frec.getRow();
    thisColumn = frec.getColumn();
    if (outputFormulaValues) {
     if (Double.isNaN(frec.getValue())) {
      outputNextStringRecord = true;
      nextRow = frec.getRow();
      nextColumn = frec.getColumn();
     } else {
      thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
     }
    } else {
     thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
    }
    cellList.add(thisColumn, thisStr);
    checkRowIsNull(thisStr); //如果里面某个单元格含有值,则标识该行不为空行
    break;
   case StringRecord.sid: //单元格中公式的字符串
    if (outputNextStringRecord) {
     StringRecord srec = (StringRecord) record;
     thisStr = srec.getString();
     thisRow = nextRow;
     thisColumn = nextColumn;
     outputNextStringRecord = false;
    }
    break;
   case LabelRecord.sid:
    LabelRecord lrec = (LabelRecord) record;
    curRow = thisRow = lrec.getRow();
    thisColumn = lrec.getColumn();
    value = lrec.getValue().trim();
    value = value.equals("") ? "" : value;
    cellList.add(thisColumn, value);
    checkRowIsNull(value); //如果里面某个单元格含有值,则标识该行不为空行
    break;
   case LabelSSTRecord.sid: //单元格为字符串类型
    LabelSSTRecord lsrec = (LabelSSTRecord) record;
    curRow = thisRow = lsrec.getRow();
    thisColumn = lsrec.getColumn();
    if (sstRecord == null) {
     cellList.add(thisColumn, "");
    } else {
     value = sstRecord.getString(lsrec.getSSTIndex()).toString().trim();
     value = value.equals("") ? "" : value;
     cellList.add(thisColumn, value);
     checkRowIsNull(value); //如果里面某个单元格含有值,则标识该行不为空行
    }
    break;
   case NumberRecord.sid: //单元格为数字类型
    NumberRecord numrec = (NumberRecord) record;
    curRow = thisRow = numrec.getRow();
    thisColumn = numrec.getColumn();
 
    //第一种方式
    //value = formatListener.formatNumberDateCell(numrec).trim();//这个被写死,采用的m/d/yy h:mm格式,不符合要求
 
    //第二种方式,参照formatNumberDateCell里面的实现方法编写
    Double valueDouble=((NumberRecord)numrec).getValue();
    String formatString=formatListener.getFormatString(numrec);
    if (formatString.contains("m/d/yy")){
     formatString="yyyy-MM-dd hh:mm:ss";
    }
    int formatIndex=formatListener.getFormatIndex(numrec);
    value=formatter.formatRawCellContents(valueDouble, formatIndex, formatString).trim();
 
    value = value.equals("") ? "" : value;
    //向容器加入列值
    cellList.add(thisColumn, value);
    checkRowIsNull(value); //如果里面某个单元格含有值,则标识该行不为空行
    break;
   default:
    break;
  }
 
  //遇到新行的操作
  if (thisRow != -1 && thisRow != lastRowNumber) {
   lastColumnNumber = -1;
  }
 
  //空值的操作
  if (record instanceof MissingCellDummyRecord) {
   MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
   curRow = thisRow = mc.getRow();
   thisColumn = mc.getColumn();
   cellList.add(thisColumn, "");
  }
 
  //更新行和列的值
  if (thisRow > -1)
   lastRowNumber = thisRow;
  if (thisColumn > -1)
   lastColumnNumber = thisColumn;
 
  //行结束时的操作
  if (record instanceof LastCellOfRowDummyRecord) {
   if (minColums > 0) {
    //列值重新置空
    if (lastColumnNumber == -1) {
     lastColumnNumber = 0;
    }
   }
   lastColumnNumber = -1;
 
   if (flag&&curRow!=0) { //该行不为空行且该行不是第一行,发送(第一行为列名,不需要)
    //ExcelReaderUtil.sendRows(filePath, sheetName, sheetIndex, curRow + 1, cellList); //每行结束时,调用sendRows()方法
    totalRows++;
    //添加到数据集合中
    dataList.add(cellList);
   }
   //清空容器
   cellList = new ArrayList<String>(); 
   //cellList.clear();
   flag=false;
  }
 }
 
 /**
  * 如果里面某个单元格含有值,则标识该行不为空行
  * @param value
  */
 public void checkRowIsNull(String value){
  if (value != null && !"".equals(value)) {
   flag = true;
  }
 }
}

读取xlsx格式数据(自动获取表头长度方式):

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
 
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
 
/**
 * @desc POI读取excel有两种模式,一种是用户模式,一种是事件驱动模式
 * 采用SAX事件驱动模式解决XLSX文件,可以有效解决用户模式内存溢出的问题,
 * 该模式是POI官方推荐的读取大数据的模式,
 * 在用户模式下,数据量较大,Sheet较多,或者是有很多无用的空行的情况下,容易出现内存溢出
 * <p>
 * 用于解决.xlsx2007版本大数据量问题
 **/
public class ExcelXlsxReader extends DefaultHandler {
 
 //存储所有数据
 private List<List<String>> dataList = new ArrayList<List<String>>();
 
 /**
  * 单元格中的数据可能的数据类型
  */
 enum CellDataType {
  BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, DATE, NULL
 }
 
 /**
  * 共享字符串表
  */
 private SharedStringsTable sst;
 
 /**
  * 上一次的索引值
  */
 private String lastIndex;
 
 /**
  * 文件的绝对路径
  */
 private String filePath = "";
 
 /**
  * 工作表索引
  */
 private int sheetIndex = 0;
 
 /**
  * sheet名
  */
 private String sheetName = "";
 
 /**
  * 总行数
  */
 private int totalRows=0;
 
 /**
  * 一行内cell集合
  */
 private List<String> cellList = new ArrayList<String>();
 
 /**
  * 判断整行是否为空行的标记
  */
 private boolean flag = false;
 
 /**
  * 当前行
  */
 private int curRow = 1;
 
 /**
  * 当前列
  */
 private int curCol = 0;
 
 /**
  * T元素标识
  */
 private boolean isTElement;
 
 /**
  * 判断上一单元格是否为文本空单元格
  */
 private boolean startElementFlag = true;
 private boolean endElementFlag = false;
 private boolean charactersFlag = false;
 
 /**
  * 异常信息,如果为空则表示没有异常
  */
 private String exceptionMessage;
 
 /**
  * 单元格数据类型,默认为字符串类型
  */
 private CellDataType nextDataType = CellDataType.SSTINDEX;
 
 private final DataFormatter formatter = new DataFormatter();
 
 /**
  * 单元格日期格式的索引
  */
 private short formatIndex;
 
 /**
  * 日期格式字符串
  */
 private String formatString;
 
 //定义前一个元素和当前元素的位置,用来计算其中空的单元格数量,如A6和A8等
 private String prePreRef = "A", preRef = null, ref = null;
 
 //定义该文档一行最大的单元格数,用来补全一行最后可能缺失的单元格
 private String maxRef = null;
 
 /**
  * 单元格
  */
 private StylesTable stylesTable;
 
 /**
  * 遍历工作簿中所有的电子表格
  * 并缓存在mySheetList中
  *
  * @param filename
  * @throws Exception
  */
 public int process(String filename) throws Exception {
  filePath = filename;
  OPCPackage pkg = OPCPackage.open(filename);
  XSSFReader xssfReader = new XSSFReader(pkg);
  stylesTable = xssfReader.getStylesTable();
  SharedStringsTable sst = xssfReader.getSharedStringsTable();
  XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
  this.sst = sst;
  parser.setContentHandler(this);
  XSSFReader.SheetIterator sheets = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
  while (sheets.hasNext()) { //遍历sheet
   curRow = 1; //标记初始行为第一行
   sheetIndex++;
   InputStream sheet = sheets.next(); //sheets.next()和sheets.getSheetName()不能换位置,否则sheetName报错
   sheetName = sheets.getSheetName();
   InputSource sheetSource = new InputSource(sheet);
   parser.parse(sheetSource); //解析excel的每条记录,在这个过程中startElement()、characters()、endElement()这三个函数会依次执行
   sheet.close();
  }
  return totalRows; //返回该excel文件的总行数,不包括首列和空行
 }
 
 public List<List<String>> process(InputStream in) throws Exception {
  //filePath = filename;
  OPCPackage pkg = OPCPackage.open(in);
  XSSFReader xssfReader = new XSSFReader(pkg);
  stylesTable = xssfReader.getStylesTable();
  SharedStringsTable sst = xssfReader.getSharedStringsTable();
  //XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
  XMLReader parser = XMLReaderFactory.createXMLReader();
  this.sst = sst;
  parser.setContentHandler(this);
  XSSFReader.SheetIterator sheets = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
  while (sheets.hasNext()) { //遍历sheet
   curRow = 1; //标记初始行为第一行
   sheetIndex++;
   InputStream sheet = sheets.next(); //sheets.next()和sheets.getSheetName()不能换位置,否则sheetName报错
   sheetName = sheets.getSheetName();
   InputSource sheetSource = new InputSource(sheet);
   parser.parse(sheetSource); //解析excel的每条记录,在这个过程中startElement()、characters()、endElement()这三个函数会依次执行
   sheet.close();
  }
  return dataList;
 }
 
 /**
  * 第一个执行
  *
  * @param uri
  * @param localName
  * @param name
  * @param attributes
  * @throws SAXException
  */
 @Override
 public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
  //c => 单元格
  if ("c".equals(name)) {
 
   //前一个单元格的位置
   if (preRef == null) {
    preRef = attributes.getValue("r");
 
   } else {
    //中部文本空单元格标识 ‘endElementFlag' 判断前一次是否为文本空字符串,true则表明不是文本空字符串,false表明是文本空字符串跳过把空字符串的位置赋予preRef
    if (endElementFlag){
     preRef = ref;
    }
   }
 
   //当前单元格的位置
   ref = attributes.getValue("r");
   //首部文本空单元格标识 ‘startElementFlag' 判断前一次,即首部是否为文本空字符串,true则表明不是文本空字符串,false表明是文本空字符串, 且已知当前格,即第二格带“B”标志,则ref赋予preRef
   if (!startElementFlag && !flag){ //上一个单元格为文本空单元格,执行下面的,使ref=preRef;flag为true表明该单元格之前有数据值,即该单元格不是首部空单元格,则跳过
    // 这里只有上一个单元格为文本空单元格,且之前的几个单元格都没有值才会执行
    preRef = ref;
   }
 
   //设定单元格类型
   this.setNextDataType(attributes);
   endElementFlag = false;
   charactersFlag = false;
   startElementFlag = false;
  }
 
  //当元素为t时
  if ("t".equals(name)) {
   isTElement = true;
  } else {
   isTElement = false;
  }
 
  //置空
  lastIndex = "";
 }
 
 
 
 /**
  * 第二个执行
  * 得到单元格对应的索引值或是内容值
  * 如果单元格类型是字符串、INLINESTR、数字、日期,lastIndex则是索引值
  * 如果单元格类型是布尔值、错误、公式,lastIndex则是内容值
  * @param ch
  * @param start
  * @param length
  * @throws SAXException
  */
 @Override
 public void characters(char[] ch, int start, int length) throws SAXException {
  startElementFlag = true;
  charactersFlag = true;
  lastIndex += new String(ch, start, length);
 }
 
 /**
  * 第三个执行
  *
  * @param uri
  * @param localName
  * @param name
  * @throws SAXException
  */
 @Override
 public void endElement(String uri, String localName, String name) throws SAXException {
  //t元素也包含字符串
  if (isTElement) {
   //将单元格内容加入rowlist中,在这之前先去掉字符串前后的空白符
   String value = lastIndex.trim();
   cellList.add(curCol, value);
   endElementFlag = true;
   curCol++;
   isTElement = false;
   //如果里面某个单元格含有值,则标识该行不为空行
   if (value != null && !"".equals(value)) {
    flag = true;
   }
  } else if ("v".equals(name)) {
   //v => 单元格的值,如果单元格是字符串,则v标签的值为该字符串在SST中的索引
   String value = this.getDataValue(lastIndex.trim(), "");//根据索引值获取对应的单元格值
 
   //补全单元格之间的空单元格
   if (!ref.equals(preRef)) {
    int len = countNullCell(ref, preRef);
    for (int i = 0; i < len; i++) {
     cellList.add(curCol, "");
     curCol++;
    }
   } else if (ref.equals(preRef) && !ref.startsWith("A")){ //ref等于preRef,且以B或者C...开头,表明首部为空格
    int len = countNullCell(ref, "A");
    for (int i = 0; i <= len; i++) {
     cellList.add(curCol, "");
     curCol++;
    }
   }
   cellList.add(curCol, value);
   curCol++;
   endElementFlag = true;
   //如果里面某个单元格含有值,则标识该行不为空行
   if (value != null && !"".equals(value)) {
    flag = true;
   }
  } else {
   //如果标签名称为row,这说明已到行尾,调用optRows()方法
   if ("row".equals(name)) {
    //默认第一行为表头,以该行单元格数目为最大数目
    if (curRow == 1) {
     maxRef = ref;
    }
    //补全一行尾部可能缺失的单元格
    if (maxRef != null) {
     int len = -1;
     //前一单元格,true则不是文本空字符串,false则是文本空字符串
     if (charactersFlag){
      len = countNullCell(maxRef, ref);
     }else {
      len = countNullCell(maxRef, preRef);
     }
     for (int i = 0; i <= len; i++) {
      cellList.add(curCol, "");
      curCol++;
     }
    }
 
    if (flag&&curRow!=1){ //该行不为空行且该行不是第一行,则发送(第一行为列名,不需要)
     //ExcelReaderUtil.sendRows(filePath, sheetName, sheetIndex, curRow, cellList);
     totalRows++;
     //添加到数据集合中
     dataList.add(cellList);
    }
 
    //清空容器
    cellList = new ArrayList<String>(); 
    //cellList.clear();
    curRow++;
    curCol = 0;
    preRef = null;
    prePreRef = null;
    ref = null;
    flag=false;
   }
  }
 }
 
 /**
  * 处理数据类型
  *
  * @param attributes
  */
 public void setNextDataType(Attributes attributes) {
  nextDataType = CellDataType.NUMBER; //cellType为空,则表示该单元格类型为数字
  formatIndex = -1;
  formatString = null;
  String cellType = attributes.getValue("t"); //单元格类型
  String cellStyleStr = attributes.getValue("s"); //
  String columnData = attributes.getValue("r"); //获取单元格的位置,如A1,B1
 
  if ("b".equals(cellType)) { //处理布尔值
   nextDataType = CellDataType.BOOL;
  } else if ("e".equals(cellType)) { //处理错误
   nextDataType = CellDataType.ERROR;
  } else if ("inlineStr".equals(cellType)) {
   nextDataType = CellDataType.INLINESTR;
  } else if ("s".equals(cellType)) { //处理字符串
   nextDataType = CellDataType.SSTINDEX;
  } else if ("str".equals(cellType)) {
   nextDataType = CellDataType.FORMULA;
  }
 
  if (cellStyleStr != null) { //处理日期
   int styleIndex = Integer.parseInt(cellStyleStr);
   XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
   formatIndex = style.getDataFormat();
   formatString = style.getDataFormatString();
   if (formatString.contains("m/d/yyyy") || formatString.contains("yyyy/mm/dd")|| formatString.contains("yyyy/m/d") ) {
    nextDataType = CellDataType.DATE;
    formatString = "yyyy-MM-dd hh:mm:ss";
   }
 
   if (formatString == null) {
    nextDataType = CellDataType.NULL;
    formatString = BuiltinFormats.getBuiltinFormat(formatIndex);
   }
  }
 }
 
 /**
  * 对解析出来的数据进行类型处理
  * @param value 单元格的值,
  *    value代表解析:BOOL的为0或1, ERROR的为内容值,FORMULA的为内容值,INLINESTR的为索引值需转换为内容值,
  *    SSTINDEX的为索引值需转换为内容值, NUMBER为内容值,DATE为内容值
  * @param thisStr 一个空字符串
  * @return
  */
 @SuppressWarnings("deprecation")
 public String getDataValue(String value, String thisStr) {
  switch (nextDataType) {
   // 这几个的顺序不能随便交换,交换了很可能会导致数据错误
   case BOOL: //布尔值
    char first = value.charAt(0);
    thisStr = first == '0' ? "FALSE" : "TRUE";
    break;
   case ERROR: //错误
    thisStr = "\"ERROR:" + value.toString() + '"';
    break;
   case FORMULA: //公式
    thisStr = '"' + value.toString() + '"';
    break;
   case INLINESTR:
    XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
    thisStr = rtsi.toString();
    rtsi = null;
    break;
   case SSTINDEX: //字符串
    String sstIndex = value.toString();
    try {
     int idx = Integer.parseInt(sstIndex);
     XSSFRichTextString rtss = new XSSFRichTextString(sst.getEntryAt(idx));//根据idx索引值获取内容值
     thisStr = rtss.toString();
     //System.out.println(thisStr);
     //有些字符串是文本格式的,但内容却是日期
 
     rtss = null;
    } catch (NumberFormatException ex) {
     thisStr = value.toString();
    }
    break;
   case NUMBER: //数字
    if (formatString != null) {
     thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString).trim();
    } else {
     thisStr = value;
    }
    thisStr = thisStr.replace("_", "").trim();
    break;
   case DATE: //日期
    thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString);
    // 对日期字符串作特殊处理,去掉T
    thisStr = thisStr.replace("T", " ");
    break;
   default:
    thisStr = " ";
    break;
  }
  return thisStr;
 }
 
 public int countNullCell(String ref, String preRef) {
  //excel2007最大行数是1048576,最大列数是16384,最后一列列名是XFD
  String xfd = ref.replaceAll("\\d+", "");
  String xfd_1 = preRef.replaceAll("\\d+", "");
 
  xfd = fillChar(xfd, 3, '@', true);
  xfd_1 = fillChar(xfd_1, 3, '@', true);
 
  char[] letter = xfd.toCharArray();
  char[] letter_1 = xfd_1.toCharArray();
  int res = (letter[0] - letter_1[0]) * 26 * 26 + (letter[1] - letter_1[1]) * 26 + (letter[2] - letter_1[2]);
  return res - 1;
 }
 
 public String fillChar(String str, int len, char let, boolean isPre) {
  int len_1 = str.length();
  if (len_1 < len) {
   if (isPre) {
    for (int i = 0; i < (len - len_1); i++) {
     str = let + str;
    }
   } else {
    for (int i = 0; i < (len - len_1); i++) {
     str = str + let;
    }
   }
  }
  return str;
 }
 
 /**
  * @return the exceptionMessage
  */
 public String getExceptionMessage() {
  return exceptionMessage;
 }
}

读取工具类:

import java.io.InputStream;
import java.util.List;
 
public class ExcelReaderUtil {
 
 //excel2003扩展名
 public static final String EXCEL03_EXTENSION = ".xls";
 //excel2007扩展名
 public static final String EXCEL07_EXTENSION = ".xlsx";
 
 //读取xls格式
 public static List<List<String>> readExcelXls(InputStream in) throws Exception {
  ExcelXlsReader excelXls=new ExcelXlsReader();
  List<List<String>> dataList =excelXls.process(in);
  return dataList;
 }
 
 //读取xlsx格式
 public static List<List<String>> readExcelXlsx(InputStream in) throws Exception {
  ExcelXlsxReader excelXls=new ExcelXlsxReader();
  List<List<String>> dataList =excelXls.process(in);
  return dataList;
 }
 
 public static void readExcel(String fileName) throws Exception {
  int totalRows =0;
  if (fileName.endsWith(EXCEL03_EXTENSION)) { //处理excel2003文件
   ExcelXlsReader excelXls=new ExcelXlsReader();
   totalRows =excelXls.process(fileName);
  } else if (fileName.endsWith(EXCEL07_EXTENSION)) {//处理excel2007文件
   ExcelXlsxReader excelXlsxReader = new ExcelXlsxReader();
   totalRows = excelXlsxReader.process(fileName);
  } else {
   throw new Exception("文件格式错误,fileName的扩展名只能是xls或xlsx。");
  }
  System.out.println("发送的总行数:" + totalRows);
 }
 
 public static void main(String[] args) throws Exception {
  String path="C:\\test.xlsx";
  ExcelReaderUtil.readExcel(path);
 }
}

Copyright 2022 版权所有 软件发布 访问手机版

声明:所有软件和文章来自软件开发商或者作者 如有异议 请与本站联系 联系我们