Several related tests were carried out for Spark: a word count test for Spark, a feasibility test for Spark Streaming, and a Kafka message production test.
6.1 Spark Word Count Test
This test case checks whether Spark jobs can be run at all: the word count program below is submitted to the cluster, and a small local smoke test is sketched after the listing.
import org.apache.spark.{SparkConf, SparkContext}

object ScalaPi {
  def main(args: Array[String]): Unit = {
    // Create SparkConf and set the name of the app
    val conf = new SparkConf()
      .setAppName("ScalaPi.scala")
      .setMaster("yarn-client")
    // Create SparkContext, which is the entry point to the Spark app
    val sc = new SparkContext(conf)
    val res = sc.textFile(args(0))
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _, 4)
      .sortBy(_._2, false)
    res.collect().foreach(println(_))
    res.saveAsTextFile(args(1)) // args(1) sets the output path in the Hue task submission platform
    // Stop sc and end the task
    sc.stop()
  }
}
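As a minimal sketch (the object name, the local[2] master, and the sample input are illustrative assumptions, not part of the cluster test), the same word count logic can also be exercised locally before submitting to YARN:

import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical local smoke test for the same word count logic
object WordCountLocalCheck {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("wordcount-local").setMaster("local[2]"))
    // Small in-memory input instead of an HDFS path
    val lines = sc.parallelize(Seq("spark test spark", "kafka test"))
    val counts = lines
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)
    // Expected counts: spark -> 2, test -> 2, kafka -> 1 (order of ties may vary)
    counts.collect().foreach(println)
    sc.stop()
  }
}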
6.2 Kafka Data Generation Test
The Kafka message generator is used to test message production and consumption; the producer code is shown below, and a simple verification consumer is sketched after it.
import java.io.PrintWriter
import java.text.DecimalFormat
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import scala.util.Random

object make_data {
  def main(args: Array[String]): Unit = {
    // Producer configuration
    val props = new Properties()
    props.put("bootstrap.servers", "47.103.10.241:9093")
    props.put("acks", "all")
    props.put("retries", "0")
    props.put("batch.size", "2048")
    props.put("linger.ms", "1")
    props.put("buffer.memory", "4096")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    println("see this is ok")
    //val write_logs = new PrintWriter(args(0))
    val users = Array("jack", "leo", "andy", "lucy", "jim", "smith", "iverson", "andrew")
    val pages = Array("iphone4.html", "huawei.html", "mi.html", "mac.html", "note.html", "book.html", "fanlegefan.com")
    val df = new DecimalFormat("#.00")
    val random = new Random()
    val num = 10
    println("see this is ok,too")
    // Build "user,page,money,timestamp" messages and send them to the "test" topic
    for (i <- 0 to num) {
      val message = users(random.nextInt(users.length)) + "," + pages(random.nextInt(pages.length)) +
        "," + df.format(random.nextDouble() * 1000) + "," + System.currentTimeMillis()
      producer.send(new ProducerRecord[String, String]("test", Integer.toString(i), message))
      println(message)
      //write_logs.println(message + "\n")
    }
    println("woo!see this is ok,too")
    producer.close()
  }
}
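To check the consumption side independently of Spark, a plain KafkaConsumer can read back the generated messages. This is only a sketch under the assumptions of the producer test above (same broker address and topic "test"; the group id and object name are illustrative):

import java.util.{Collections, Properties}
import org.apache.kafka.clients.consumer.KafkaConsumer
import scala.collection.JavaConverters._

// Hypothetical verification consumer for the messages produced by make_data
object consume_check {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "47.103.10.241:9093") // same broker as the producer test
    props.put("group.id", "consume_check")               // illustrative group id
    props.put("auto.offset.reset", "earliest")
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    val consumer = new KafkaConsumer[String, String](props)
    consumer.subscribe(Collections.singletonList("test"))
    // Poll a few times and print whatever records come back
    for (_ <- 1 to 5) {
      val records = consumer.poll(1000L)
      records.asScala.foreach(r => println(r.key() + " -> " + r.value()))
    }
    consumer.close()
  }
}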
In the Kafka message generator test, the code itself and the data generation ran without problems. The problem was that, when pushing the data to the broker port, the connection to the relevant broker nodes could not be established.
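To narrow down this kind of failure, it helps to first confirm that the broker port is reachable at all from the client machine. The minimal sketch below (host and port taken from the producer test; the object name is illustrative) separates a network, firewall, or listener-configuration problem from a problem in the producer code:

import java.net.{InetSocketAddress, Socket}

// Hypothetical reachability check for the broker port used in the producer test
object port_check {
  def main(args: Array[String]): Unit = {
    val socket = new Socket()
    try {
      // A timeout or refusal here points to the network or broker listener setup,
      // not to the Kafka producer code
      socket.connect(new InetSocketAddress("47.103.10.241", 9093), 5000)
      println("broker port is reachable")
    } catch {
      case e: Exception => println("cannot reach broker port: " + e.getMessage)
    } finally {
      socket.close()
    }
  }
}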
6.3 Spark Streaming Test Demo
The demo below reads the generated messages from the "test" topic with Spark Streaming, parses each record, and appends the result to a local log file.

import java.io.PrintWriter
import java.text.SimpleDateFormat
import java.util.Date
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object streaming_count {
  def main(args: Array[String]): Unit = {
    val df = new SimpleDateFormat("yyyyMMdd")
    val group = "test"
    val topics = "test"
    val sparkConf = new SparkConf().setAppName("streaming_count").setMaster("yarn-client")
    val sc = new SparkContext(sparkConf)
    // 10-second micro-batches
    val ssc = new StreamingContext(sc, Seconds(10))
    val topicSets = topics.split(",").toSet
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> "47.103.10.241:9093",
      "group.id" -> group
    )
    // Read messages directly from Kafka
    val kafkastream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicSets)
    val write_log = new PrintWriter(args(0))
    // Parse each "user,page,money,timestamp" record into an output line
    val events = kafkastream.map { tuple =>
      val line = tuple._2.split(",")
      val user = line(0)
      val page = line(1)
      val money = line(2)
      val day = df.format(new Date(line(3).toLong))
      user + "," + page + "," + money + "," + day
    }
    // Collect each batch on the driver, print it, and append it to the log file;
    // foreachRDD also registers the output operation that triggers execution
    events.foreachRDD { rdd =>
      rdd.collect().foreach { record =>
        println(record)
        write_log.println(record)
      }
      write_log.flush()
    }
    ssc.start()
    ssc.awaitTermination()
  }
}