Spark and Spark Streaming related test demo

Posted by Niko on Tue, 01 Oct 2019 06:50:31 +0200

Several related tests were carried out for Spark: a word count test for Spark, a feasibility test for Spark Streaming, and a test of Kafka message production.

6.1 Spark word count test

This test case checks whether Spark jobs can be run on the cluster at all.

import org.apache.spark.{SparkConf, SparkContext}

object ScalaPi {
  def main(args: Array[String]): Unit = {
    // Create SparkConf and set the name of the app
    val conf = new SparkConf()
      .setAppName("ScalaPi.scala")
      .setMaster("yarn-client")

    // Create SparkContext, the entry point of a Spark app
    val sc = new SparkContext(conf)

    // Classic word count: split lines into words, count them, and sort by frequency (descending)
    val res = sc.textFile(args(0))
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _, 4)
      .sortBy(_._2, ascending = false)

    res.collect().foreach(println)
    // args(1) is the output path set on the Hue task submission platform
    res.saveAsTextFile(args(1))

    // Stop the SparkContext and end the task
    sc.stop()
  }
}
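When the job is submitted (for example from the Hue task submission platform in yarn-client mode, as in the comment above), args(0) is the input text file to count and args(1) is the output directory. Note that saveAsTextFile will fail if the output directory already exists, so a fresh path should be used for each run.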

6.2 Kafka Data Generation Test

The Kafka message generator is used to test Kafka message production and consumption.

import java.io.PrintWriter
import java.text.DecimalFormat
import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

import scala.util.Random

object make_data {

  def main(args: Array[String]): Unit = {

    // Producer configuration
    val props = new Properties()
    props.put("bootstrap.servers", "47.103.10.241:9093")
    props.put("acks", "all")
    props.put("retries", "0")
    props.put("batch.size", "2048")
    props.put("linger.ms", "1")
    props.put("buffer.memory", "4096")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)
    println("see this is ok")
    //val write_logs = new PrintWriter(args(0))

    // Sample data used to build random "user,page,amount,timestamp" messages
    val users = Array("jack", "leo", "andy", "lucy", "jim", "smith", "iverson", "andrew")
    val pages = Array("iphone4.html", "huawei.html", "mi.html", "mac.html", "note.html", "book.html", "fanlegefan.com")
    val df = new DecimalFormat("#.00")
    val random = new Random()
    val num = 10
    println("see this is ok,too")

    // Send num + 1 random messages to the "test" topic
    for (i <- 0 to num) {
      val message = users(random.nextInt(users.length)) + "," + pages(random.nextInt(pages.length)) +
        "," + df.format(random.nextDouble() * 1000) + "," + System.currentTimeMillis()
      producer.send(new ProducerRecord[String, String]("test", Integer.toString(i), message))
      println(message)
      //write_logs.println(message + "\n")
    }
    println("woo!see this is ok,too")
    producer.close()
  }
}

In the Kafka message generator test, the code itself and the data generation work fine. The problem is that when sending the data to the broker port, the producer cannot establish a connection to the relevant nodes.
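A common cause of this symptom is that the broker's advertised listeners do not point to an address that is reachable from the client machine, or that port 9093 is blocked. Below is a minimal connectivity-check sketch, assuming a reasonably recent kafka-clients jar (0.11+) is on the classpath; the object name check_broker and the 5-second timeout are arbitrary choices for illustration, not part of the original test.

import java.util.Properties
import java.util.concurrent.TimeUnit

import org.apache.kafka.clients.admin.AdminClient

import scala.collection.JavaConverters._

object check_broker {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    // Same broker address as in make_data; adjust if the advertised listener differs
    props.put("bootstrap.servers", "47.103.10.241:9093")
    props.put("request.timeout.ms", "5000")

    val admin = AdminClient.create(props)
    try {
      // Ask the broker which nodes make up the cluster; this fails fast
      // if the advertised listeners are not reachable from this machine
      val nodes = admin.describeCluster().nodes().get(5, TimeUnit.SECONDS)
      println("reachable nodes: " + nodes.asScala.mkString(", "))

      // List the topics, to confirm that "test" exists
      val topics = admin.listTopics().names().get(5, TimeUnit.SECONDS)
      println("topics: " + topics.asScala.mkString(", "))
    } finally {
      admin.close()
    }
  }
}

If describeCluster() times out here, the producer in make_data will not be able to send either, and the broker's advertised.listeners setting (or the firewall rules for port 9093) is the first thing to check.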

6.3 Spark Streaming test demo

import java.io.PrintWriter
import java.text.SimpleDateFormat
import java.util.Date
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
object streaming_count {
  def main(args: Array[String]): Unit = {
    val df = new SimpleDateFormat("yyyyMMdd")
    val group = "test"
    val topics = "test"

    val sparkConf = new SparkConf().setAppName("streaming_count").setMaster("yarn-client")

    val sc = new SparkContext(sparkConf)
    // Batch interval of 10 seconds
    val ssc = new StreamingContext(sc, Seconds(10))
    val topicSets = topics.split(",").toSet
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> "47.103.10.241:9093",
      "group.id" -> group
    )
    val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicSets)

    // args(0) is the local path of the output log file
    val write_log = new PrintWriter(args(0))

    // Parse each "user,page,money,timestamp" message into an output line
    val events = kafkaStream.map { case (_, value) =>
      val line = value.split(",")
      val user = line(0)
      val page = line(1)
      val money = line(2)
      val day = df.format(new Date(line(3).toLong))
      user + "," + page + "," + money + "," + day
    }

    // An output operation is required, otherwise the streaming job has nothing to execute;
    // collect each batch back to the driver so the PrintWriter can be used safely
    events.foreachRDD { rdd =>
      rdd.collect().foreach { record =>
        println(record)
        write_log.println(record)
      }
      write_log.flush()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
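To test the pipeline end to end: start streaming_count first, passing a writable local path as args(0) for the log file, then run make_data to push messages to the "test" topic. Each 10-second batch should then print the parsed records and append them to the log file.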
