Specific programming scenarios of Spark SQL

Posted by tigomark on Sun, 01 Dec 2019 01:17:03 +0100

Introductory example:

import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Column, DataFrame, SQLContext, SparkSession}

object SparkSqlTest {
    def main(args: Array[String]): Unit = {
        //Block redundant logs
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.project-spark").setLevel(Level.WARN)
        //Build the programming entry point
        val conf: SparkConf = new SparkConf()
        conf.setAppName("SparkSqlTest")
            .setMaster("local[2]")
        val spark: SparkSession = SparkSession.builder().config(conf)
            .getOrCreate()

        /**
          * Note for Spark 2.0+:
          * val sqlContext = new SQLContext(sparkContext)
          * val hiveContext = new HiveContext(sparkContext)
          * These primary constructors are now private, so the SparkSession object must be used instead
          */
        //Create sqlcontext object
        val sqlContext: SQLContext = spark.sqlContext
        //Load the data as a DataFrame; here we load JSON data
        //Data format: {"name":"...", "age":18}
        val perDF: DataFrame = sqlContext.read.json("hdfs://zzy/data/person.json")

        //Print the table schema
        perDF.printSchema()

        //View data, display 20 records by default
        perDF.show()

        //More complex queries
        perDF.select("name").show() //Select specific columns
        perDF.select(new Column("name"), new Column("age").>(18)).show()  //Column expressions: name plus the boolean result of age > 18
        perDF.select("name","age").where(new Column("age").>(18)).show() //Filter rows with a where condition
        perDF.select("age").groupBy("age").avg("age").show() //Aggregation: average age per group
    }
}
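
The same queries can also be written as plain SQL by registering the DataFrame as a temporary view. A minimal sketch, assuming the spark session and perDF from the example above:

        //Register the DataFrame as a temporary view so it can be queried with SQL
        perDF.createOrReplaceTempView("person")
        //SQL equivalents of the DSL queries above
        spark.sql("select name from person").show()
        spark.sql("select name, age from person where age > 18").show()
        spark.sql("select age, avg(age) from person group by age").show()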

If the introductory example is not entirely clear, the following sections walk through it step by step:

(1) Conversion between RDD / DataSet / DataFrame / List

There are two ways to convert an RDD (or an external collection) into a DataFrame/DataSet: by reflection, and by building the schema programmatically.

   note: with the reflection approach, a DataFrame corresponds to a Java bean, while a DataSet corresponds to a case class
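
To make that note concrete, here is a minimal sketch (the bean-style StudentBean class is only an illustration and is not part of the original example): createDataFrame(..., classOf[...]) infers columns from bean-style getters/setters, while createDataset relies on the Encoder that a case class gets through spark.implicits._.

        import scala.beans.BeanProperty

        //Bean-style class: fields exposed through getters/setters, usable with createDataFrame(javaList, classOf[StudentBean])
        class StudentBean(@BeanProperty var name: String,
                          @BeanProperty var birthday: String,
                          @BeanProperty var province: String) extends Serializable
        //A case class (like Student below) carries an implicit Encoder and is what createDataset / toDS() expect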

Converting an RDD or external collection to a DataFrame/DataSet by reflection

Data preparation:

case class Student(name:String, birthday:String, province:String)
val stuList = List(
      Student("Wei xx", "1998-11-11", "Shanxi"),
      Student("Wu xx", "1999-06-08", "Henan"),
      Student("Qi xx", "2000-03-08", "Shandong"),
      Student("Wang xx", "1997-07-09", "Anhui"),
      Student("Xue xx", "2002-08-09", "Liaoning")
    )

list --> DataFrame:

        //Block redundant logs
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.project-spark").setLevel(Level.WARN)
        //Build the programming entry point
        val conf: SparkConf = new SparkConf()
        conf.setAppName("SparkSqlTest")
            .setMaster("local[2]")
            .set("spark.serializer","org.apache.spark.serializer.KryoSerializer")
            .registerKryoClasses(Array(classOf[Student]))
        val spark: SparkSession = SparkSession.builder().config(conf)
            .getOrCreate()

        //Create sqlcontext object
        val sqlContext: SQLContext = spark.sqlContext

        /**
          * list--->DataFrame
          * Converting scala collections to java collections
          */
        val javaList: util.List[Student] = JavaConversions.seqAsJavaList(stuList)
        val stuDF: DataFrame = sqlContext.createDataFrame(javaList,classOf[Student])
        val count = stuDF.count()
        println(count)
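
As a side note, in Spark 2.x the same conversion can be written more concisely with the built-in implicits. A minimal sketch, assuming the spark session and stuList from above (stuDF2 is just a name used here for illustration):

        //toDF() relies on the implicit encoders provided by the SparkSession
        import spark.implicits._
        val stuDF2: DataFrame = stuList.toDF()
        stuDF2.printSchema()
        println(stuDF2.count())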

RDD --> DataFrame:

        //Block redundant logs
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.project-spark").setLevel(Level.WARN)
        //Build the programming entry point
        val conf: SparkConf = new SparkConf()
        conf.setAppName("SparkSqlTest")
            .setMaster("local[2]")
            .set("spark.serializer","org.apache.spark.serializer.KryoSerializer")
            .registerKryoClasses(Array(classOf[Student]))
        val spark: SparkSession = SparkSession.builder().config(conf)
            .getOrCreate()

        //Create sqlcontext object
        val sqlContext: SQLContext = spark.sqlContext
        //Create sparkContext
        val sc: SparkContext = spark.sparkContext
        /**
          * RDD--->DataFrame
          */
        val stuRDD: RDD[Student] = sc.makeRDD(stuList)
        val stuDF: DataFrame = sqlContext.createDataFrame(stuRDD,classOf[Student])
        val count = stuDF.count()
        println(count)

list --> DataSet:

        //Block redundant logs
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.project-spark").setLevel(Level.WARN)
        //Build the programming entry point
        val conf: SparkConf = new SparkConf()
        conf.setAppName("SparkSqlTest")
            .setMaster("local[2]")
            .set("spark.serializer","org.apache.spark.serializer.KryoSerializer")
            .registerKryoClasses(Array(classOf[Student]))
        val spark: SparkSession = SparkSession.builder().config(conf)
            .getOrCreate()

        //Create sqlcontext object
        val sqlContext: SQLContext = spark.sqlContext
        //Create sparkContext
        val sc: SparkContext = spark.sparkContext
        /**
          * list--->DataSet
          */
        //To create a Dataset, the implicit conversions below must be imported
        import spark.implicits._
        val stuDS: Dataset[Student] = sqlContext.createDataset(stuList)
        stuDS.createTempView("student")
        //Query with a full SQL statement. With the reflection approach, only the Dataset can be queried this way, not the DataFrame
        val sql=
            """
              |select * from student
            """.stripMargin
        spark.sql(sql).show()


RDD --> DataSet:

        //Block redundant logs
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.project-spark").setLevel(Level.WARN)
        //Build the programming entry point
        val conf: SparkConf = new SparkConf()
        conf.setAppName("SparkSqlTest")
            .setMaster("local[2]")
            .set("spark.serializer","org.apache.spark.serializer.KryoSerializer")
            .registerKryoClasses(Array(classOf[Student]))
        val spark: SparkSession = SparkSession.builder().config(conf)
            .getOrCreate()

        //Create sqlcontext object
        val sqlContext: SQLContext = spark.sqlContext
        //Create sparkContext
        val sc: SparkContext = spark.sparkContext
        /**
          * RDD--->DataSet
          */
        //To create a Dataset, the implicit conversions below must be imported
        import spark.implicits._
        val stuRDD: RDD[Student] = sc.makeRDD(stuList)
        val stuDS: Dataset[Student] = sqlContext.createDataset(stuRDD)
        stuDS.createTempView("student")
        //Query with a full SQL statement. With the reflection approach, only the Dataset can be queried this way, not the DataFrame
        val sql=
            """
              |select * from student
            """.stripMargin
        spark.sql(sql).show()
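
Again as a side note, once spark.implicits._ is in scope the conversions can also be done directly on the collection or the RDD. A minimal sketch based on the stuList and stuRDD defined above:

        import spark.implicits._
        //Conversions through the implicit encoders
        val dsFromList: Dataset[Student] = stuList.toDS()
        val dsFromRDD: Dataset[Student] = stuRDD.toDS()
        val dfFromRDD: DataFrame = stuRDD.toDF()
        dsFromRDD.show()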

Converting external collections or RDDs into DataFrames/DataSets dynamically, by building the schema programmatically

list --> DataFrame:

        //Block redundant logs
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.project-spark").setLevel(Level.WARN)
        //Build the programming entry point
        val conf: SparkConf = new SparkConf()
        conf.setAppName("SparkSqlTest")
            .setMaster("local[2]")
            .set("spark.serializer","org.apache.spark.serializer.KryoSerializer")
            .registerKryoClasses(Array(classOf[Student]))
        val spark: SparkSession = SparkSession.builder().config(conf)
            .getOrCreate()

        //Create sqlcontext object
        val sqlContext: SQLContext = spark.sqlContext
        //Create sparkContext
        val sc: SparkContext = spark.sparkContext

        //list-DataFrame

        //1. Convert all elements in the list to Row
        val RowList: List[Row] = stuList.map(item => {
            Row(item.name, item.birthday, item.province)
        })
        //2. Build metadata
        val schema=StructType(List(
            StructField("name",DataTypes.StringType),
            StructField("birthday",DataTypes.StringType),
            StructField("province",DataTypes.StringType)
        ))
        //3. Convert the Scala collection to a Java collection
        val javaList = JavaConversions.seqAsJavaList(RowList)
        val stuDF = spark.createDataFrame(javaList,schema)
        stuDF.createTempView("student")
        //Query with a full SQL statement. With the programmatic approach, both the Dataset and the DataFrame can be queried this way
        val sql=
            """
              |select * from student
            """.stripMargin
        spark.sql(sql).show()

RDD --> DataFrame:

        //Block redundant logs
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.project-spark").setLevel(Level.WARN)
        //Build the programming entry point
        val conf: SparkConf = new SparkConf()
        conf.setAppName("SparkSqlTest")
            .setMaster("local[2]")
            .set("spark.serializer","org.apache.spark.serializer.KryoSerializer")
            .registerKryoClasses(Array(classOf[Student]))
        val spark: SparkSession = SparkSession.builder().config(conf)
            .getOrCreate()

        //Create sqlcontext object
        val sqlContext: SQLContext = spark.sqlContext
        //Create sparkContext
        val sc: SparkContext = spark.sparkContext

        //RDD-DataFrame

        //1. Convert the elements in the RDD to Row
        val RowRDD: RDD[Row] = sc.makeRDD(stuList).map(item => {
            Row(item.name, item.birthday, item.province)
        })

        //2. Build metadata
        val schema=StructType(List(
            StructField("name",DataTypes.StringType),
            StructField("birthday",DataTypes.StringType),
            StructField("province",DataTypes.StringType)
        ))
        val stuDF = spark.createDataFrame(RowRDD,schema)
        stuDF.createTempView("student")
        //Query with a full SQL statement. With the programmatic approach, both the Dataset and the DataFrame can be queried this way
        val sql=
            """
              |select * from student
            """.stripMargin
        spark.sql(sql).show()

Building a DataSet programmatically is essentially the same as building a DataFrame, so it is not demonstrated separately here.
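
If a typed result is needed, one option (a sketch that is not part of the original example) is to convert the programmatically built DataFrame into a Dataset with as[Student], which again requires the implicits:

        import spark.implicits._
        //Turn the Row-based DataFrame into a typed Dataset[Student]
        val stuDS: Dataset[Student] = stuDF.as[Student]
        stuDS.show()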

(2) How to load data with Spark SQL

        //Block redundant logs
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.project-spark").setLevel(Level.WARN)
        //Build the programming entry point
        val conf: SparkConf = new SparkConf()
        conf.setAppName("SparkSqlTest")
                .setMaster("local[2]")

        val spark: SparkSession = SparkSession.builder().config(conf)
                .getOrCreate()

        //Create sqlcontext object
        val sqlContext: SQLContext = spark.sqlContext
        //Create sparkContext
        val sc: SparkContext = spark.sparkContext

        //Load a parquet file (the default format); older versions used sqlContext.load(...)
        sqlContext.read.load("hdfs://zzy/hello.parquet")
        //Load JSON data
        sqlContext.read.json("hdfs://zzy/hello.json")
        //Load a plain text file
        sqlContext.read.text("hdfs://zzy/hello.txt")
        //Load a CSV file
        sqlContext.read.csv("hdfs://zzy/hello.csv")
        //Reading jdbc data
        val url="jdbc:mysql://localhost:3306/hello"
        val properties=new Properties()
        properties.setProperty("user","root")
        properties.setProperty("password","123456")
        val tableName="book"
        sqlContext.read.jdbc(url,tableName,properties)
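
All of these are shorthands for the generic reader API. A minimal sketch of the equivalent format/option/load style (the two options shown are just illustrative CSV options):

        //Generic reader: name the format explicitly and pass format-specific options
        spark.read
            .format("csv")
            .option("header", "true")       //first line contains column names
            .option("inferSchema", "true")  //infer column types from the data
            .load("hdfs://zzy/hello.csv")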

(3) How to write out (persist) Spark SQL data

        //Block redundant logs
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.project-spark").setLevel(Level.WARN)
        //Build the programming entry point
        val conf: SparkConf = new SparkConf()
        conf.setAppName("SparkSqlTest")
                .setMaster("local[2]")

        val spark: SparkSession = SparkSession.builder().config(conf)
                .getOrCreate()

        //Create sqlcontext object
        val sqlContext: SQLContext = spark.sqlContext
        //Create sparkContext
        val sc: SparkContext = spark.sparkContext
        val testFD: DataFrame = sqlContext.read.text("hdfs://zzy/hello.txt")

        //Write to a file
        testFD.write.format("json") //Output format
                .mode(SaveMode.Append)  //Write mode
                .save("hdfs://zzy/hello.json")  //Where to write the file

        //Write to database
        val url="jdbc:mysql://localhost:3306/hello"
        val table_name="book"
        val prots=new Properties()
        prots.put("user","root")
        prots.put("password","123456")
        testFD.write.mode(SaveMode.Append).jdbc(url,table_name,prots)
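
For completeness, the same writer API supports other formats and save modes (Append, Overwrite, ErrorIfExists, Ignore). A small sketch writing parquet with overwrite (the output path is just illustrative):

        //Write as parquet, replacing any existing output at the path
        testFD.write
                .mode(SaveMode.Overwrite)
                .parquet("hdfs://zzy/hello_parquet")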

Topics: Big Data Spark Apache SQL Programming