Querying Excel with Java + Spark SQL

Posted by GrayFox12 on Tue, 03 Dec 2019 04:21:44 +0100

Download Spark from the official website

Spark Download (any version will do). After downloading, extract it and place it under bigdata (the directory name can be changed).

Download winutils.exe, the file Hadoop requires on Windows

You can find it online; it isn't uploaded here. The file is actually optional: the error reported without it doesn't affect Spark's operation, so download it only if the warning bothers you (it bothered me). After downloading, place the file in the bigdata\hadoop\bin directory.
There is no need to create an environment variable; instead, set a system property at the start of your Java code, as follows:

System.setProperty("hadoop.home.dir", HADOOP_HOME);

Create a Java Maven project named java-spark-sql-excel

Set up the directory hierarchy as follows:

Parent directory (the directory where the project is located)
    - java-spark-sql-excel
    - bigdata
      - spark
      - hadoop
        - bin
          - winutils.exe

Code

Initialize SparkSession

static {
    System.setProperty("hadoop.home.dir", HADOOP_HOME);
    spark = SparkSession.builder()
            .appName("test")
            .master("local[*]")
            .config("spark.sql.warehouse.dir", SPARK_HOME)
            .config("spark.sql.parquet.binaryAsString", "true")
            .getOrCreate();
}
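
The constants HADOOP_HOME and SPARK_HOME and the spark field are used above but never shown in the post; plausible definitions, assuming the directory layout from earlier (hadoop.home.dir may need to be an absolute path on Windows):

// Assumed definitions (not from the original post); adjust to your own layout.
static final String HADOOP_HOME = "../bigdata/hadoop";
static final String SPARK_HOME = "../bigdata/spark";
static SparkSession spark;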

Read Excel

public static void readExcel(String filePath, String tableName) throws IOException {
    // Format used to print integral numeric cells without a decimal point
    DecimalFormat format = new DecimalFormat();
    format.applyPattern("#");
    // Create a file (it could also come from an upload: Spring MVC uses
    // CommonsMultipartFile, Jersey can use
    // org.glassfish.jersey.media.multipart.FormDataParam; see my file-upload post)
    File file = new File(filePath);
    // Create a file stream
    InputStream inputStream = new FileInputStream(file);
    // Buffer the stream
    BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream);
    // Workbook reference: .xlsx files use XSSFWorkbook, .xls files use HSSFWorkbook
    Workbook workbook = null;
    if (file.getName().endsWith(".xlsx")) workbook = new XSSFWorkbook(bufferedInputStream);
    else if (file.getName().endsWith(".xls")) workbook = new HSSFWorkbook(bufferedInputStream);
    if (workbook == null) {
        bufferedInputStream.close();
        throw new IOException("Unsupported file type: " + file.getName());
    }
    System.out.println(file.getName());
    // Obtain the Sheet iterator; every sheet becomes one temporary table
    Iterator<Sheet> dataTypeSheets = workbook.sheetIterator();
    while (dataTypeSheets.hasNext()) {
        // Field names, taken from the first row
        ArrayList<String> schemaList = new ArrayList<String>();
        // Data set for this sheet
        ArrayList<org.apache.spark.sql.Row> dataList = new ArrayList<org.apache.spark.sql.Row>();
        // Spark fields
        List<StructField> fields = new ArrayList<>();
        // Get the current sheet
        Sheet dataTypeSheet = dataTypeSheets.next();
        // Row iterator; skip the sheet if it has no rows at all
        Iterator<Row> iterator = dataTypeSheet.iterator();
        if (!iterator.hasNext()) continue;
        // Use the first row to build the table structure
        Iterator<Cell> firstRowCellIterator = iterator.next().iterator();
        while (firstRowCellIterator.hasNext()) {
            // Each cell of the first row becomes a field name
            Cell currentCell = firstRowCellIterator.next();
            // String cell
            if (currentCell.getCellTypeEnum() == CellType.STRING)
                schemaList.add(currentCell.getStringCellValue().trim());
            // Numeric cell
            if (currentCell.getCellTypeEnum() == CellType.NUMERIC)
                schemaList.add((currentCell.getNumericCellValue() + "").trim());
        }
        // Build a StructField per field name (Spark's field object takes the field
        // name, the field type, and true meaning the column is nullable)
        for (String fieldName : schemaList) {
            StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
            fields.add(field);
        }
        // Build the Spark table structure (org.apache.spark.sql.types.StructType)
        StructType schema = DataTypes.createStructType(fields);
        // Number of fields
        int len = schemaList.size();
        // Index of the last row in the current sheet
        int rowEnd = dataTypeSheet.getLastRowNum();
        // Traverse all data rows of the current sheet (row 0 is the header)
        for (int rowNum = 1; rowNum <= rowEnd; rowNum++) {
            // Get one row of data; skip physically missing rows
            Row r = dataTypeSheet.getRow(rowNum);
            if (r == null) continue;
            // Turn the row into a List
            ArrayList<String> rowDataList = new ArrayList<String>();
            // Visit exactly len cells so every Row matches the schema
            for (int cn = 0; cn < len; cn++) {
                Cell c = r.getCell(cn, Row.MissingCellPolicy.RETURN_BLANK_AS_NULL);
                if (c == null) {
                    rowDataList.add("0"); // simple zero fill for empty cells
                } else if (c.getCellTypeEnum() == CellType.STRING) {
                    rowDataList.add(c.getStringCellValue().trim()); // string
                } else if (c.getCellTypeEnum() == CellType.NUMERIC) {
                    double value = c.getNumericCellValue();
                    if (p.matcher(value + "").matches())
                        rowDataList.add(format.format(value)); // integral: drop the decimal point
                    else
                        rowDataList.add(value + ""); // keep the decimal point
                } else {
                    rowDataList.add("0"); // other cell types: zero fill to keep the row length at len
                }
            }
            // Add the row to the data set
            dataList.add(RowFactory.create(rowDataList.toArray()));
        }
        // Create a temporary table from the data and the table structure;
        // its name is tableName + the sheet name
        spark.createDataFrame(dataList, schema).createOrReplaceTempView(tableName + dataTypeSheet.getSheetName());
    }
    workbook.close();
}
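
The imports readExcel needs are not listed in the post; with the dependencies below they would look roughly like this. The regular expression p is never shown either; a plausible definition (an assumption) matches doubles whose fractional part is zero, so integral values can be detected:

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// Assumed definition of p (not in the original post): matches doubles such as
// "3.0" whose fractional part is zero, so they can be printed without a decimal point.
private static final Pattern p = Pattern.compile("-?\\d+\\.0+");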

Create the test files (test.xls and test2.xlsx) in the project directory

First Sheet, Second Sheet, Third Sheet: (screenshots omitted)
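
Since the screenshots are not reproduced, here is a minimal sketch (not from the original post) that generates a comparable three-sheet test.xls with POI; the column names and values are invented, but the shape matches what readExcel expects: the first row holds field names, the rows below hold data.

import java.io.FileOutputStream;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;

public class MakeTestFile {
    public static void main(String[] args) throws Exception {
        try (Workbook wb = new HSSFWorkbook();
             FileOutputStream out = new FileOutputStream("test.xls")) {
            for (int s = 1; s <= 3; s++) {
                Sheet sheet = wb.createSheet("Sheet" + s);
                // First row: field names that readExcel turns into the schema
                Row header = sheet.createRow(0);
                header.createCell(0).setCellValue("name");
                header.createCell(1).setCellValue("value");
                // One sample data row per sheet
                Row data = sheet.createRow(1);
                data.createCell(0).setCellValue("item" + s);
                data.createCell(1).setCellValue(s * 1.5);
            }
            wb.write(out);
        }
    }
}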

Testing

public static void main(String[] args) throws Exception {
        // Paths of the Excel files to query
        String xlsxPath = "test2.xlsx";
        String xlsPath  = "test.xls";
        // Table name prefixes
        String tableName1 = "test_table1";
        String tableName2 = "test_table2";
        // Read the files; each registered view is named tableNameN + sheet name
        readExcel(xlsxPath, tableName2);
        spark.sql("select * from " + tableName2 + "Sheet1").show();

        readExcel(xlsPath, tableName1);
        spark.sql("select * from " + tableName1 + "Sheet1").show();
        spark.sql("select * from " + tableName1 + "Sheet2").show();
        spark.sql("select * from " + tableName1 + "Sheet3").show();
    }
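
Because every sheet is registered as its own temporary view, ordinary SQL across sheets works as well. A sketch, assuming Sheet1 and Sheet2 happen to share a column named name:

// Hypothetical cross-sheet join; assumes both sheets contain a "name" column.
spark.sql("select a.*, b.* from " + tableName1 + "Sheet1 a"
        + " join " + tableName1 + "Sheet2 b on a.name = b.name").show();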

Operation result

Each show() call prints the corresponding sheet as a table (output screenshots omitted).

Related dependencies

<dependencies>
     <dependency>
        <groupId>org.spark-project.hive</groupId>
        <artifactId>hive-jdbc</artifactId>
        <version>1.2.1.spark2</version>
     </dependency>
     <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>2.3.1</version>
     </dependency>   
     <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>2.3.1</version>
     </dependency>    
     <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.6.0</version>
     </dependency>
     <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>3.17</version>
     </dependency>
     <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>3.17</version>
     </dependency>
   </dependencies>

My GitHub

Topics: Java Spark SQL Apache Hadoop