Spark Learning Notes - Core Operators

Posted by Mikester on Wed, 22 Sep 2021 17:46:44 +0200

Spark Learning Notes - Core Operators (2)

distinct operator

  /**
   * Return a new RDD containing the distinct elements in this RDD.
   */
  def distinct(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope {
    def removeDuplicatesInPartition(partition: Iterator[T]): Iterator[T] = {
      // Create an instance of external append only map which ignores values.
      val map = new ExternalAppendOnlyMap[T, Null, Null](
        createCombiner = _ => null,
        mergeValue = (a, b) => a,
        mergeCombiners = (a, b) => a)
      map.insertAll(partition.map(_ -> null))
      map.iterator.map(_._1)
    }
    partitioner match {
      case Some(_) if numPartitions == partitions.length =>
        mapPartitions(removeDuplicatesInPartition, preservesPartitioning = true)
      // Otherwise fall back to a shuffle: map each element to (x, null), reduceByKey keeping only the first of any duplicates, then take the keys
      case _ => map(x => (x, null)).reduceByKey((x, _) => x, numPartitions).map(_._1)
    }
  }

    // Using reduceByKey to implement distinct (nums is an RDD[Int] defined elsewhere)
    val result: RDD[Int] = nums.map((_, null)).reduceByKey((x, y) => x).map(_._1)
    result.foreach(println)

cogroup operator

  /**
   * For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the
   * list of values for that key in `this` as well as `other`.
   */
  def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))] = self.withScope {
    cogroup(other, defaultPartitioner(self, other))
  }


  def cogroup[W](other: RDD[(K, W)], partitioner: Partitioner)
      : RDD[(K, (Iterable[V], Iterable[W]))] = self.withScope {
    if (partitioner.isInstanceOf[HashPartitioner] && keyClass.isArray) {
      throw new SparkException("HashPartitioner cannot partition array keys.")
    }
    val cg = new CoGroupedRDD[K](Seq(self, other), partitioner)
    cg.mapValues { case Array(vs, w1s) =>
      (vs.asInstanceOf[Iterable[V]], w1s.asInstanceOf[Iterable[W]])
    }
  }
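
To make the result shape concrete, here is a minimal cogroup sketch (assuming a local SparkContext sc; the exact printed collection type may vary between Spark versions):

    val a: RDD[(String, Int)] = sc.makeRDD(List(("k1", 1), ("k1", 2), ("k2", 3)))
    val b: RDD[(String, Int)] = sc.makeRDD(List(("k1", 10), ("k3", 30)))

    // Every key from either RDD appears once, paired with the (possibly empty)
    // collections of its values from each side, e.g.
    // (k1,(CompactBuffer(1, 2),CompactBuffer(10)))
    // (k2,(CompactBuffer(3),CompactBuffer()))
    // (k3,(CompactBuffer(),CompactBuffer(30)))
    val cg: RDD[(String, (Iterable[Int], Iterable[Int]))] = a.cogroup(b)
    cg.collect().foreach(println)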

intersection operator

  /**
   * Return the intersection of this RDD and another one. The output will not contain any duplicate
   * elements, even if the input RDDs did.
   *
   * @note This method performs a shuffle internally.
   */
  def intersection(other: RDD[T]): RDD[T] = withScope {
    this.map(v => (v, null)).cogroup(other.map(v => (v, null)))
        .filter { case (_, (leftGroup, rightGroup)) => leftGroup.nonEmpty && rightGroup.nonEmpty }
        .keys
  }

Using cogroup to implement the intersection operator

    val rdd1: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4, 5))
    val rdd2: RDD[Int] = sc.makeRDD(List(4, 5, 6, 7, 8))


    val rdd3: RDD[(Int, Null)] = rdd1.map((_, null))
    val rdd4: RDD[(Int, Null)] = rdd2.map((_, null))

    val grouped: RDD[(Int, (Iterable[Null], Iterable[Null]))] = rdd3.cogroup(rdd4)
    val res: RDD[Int] = grouped.filter(
      t => t._2._1.nonEmpty && t._2._2.nonEmpty
    ).keys

    val resultRDD: RDD[Int] = rdd1.intersection(rdd2)

Using cogroup to implement the join operator

val conf: SparkConf = new SparkConf().setAppName("WordCount").setMaster("local")
val sc = new SparkContext(conf)

val rdd1: RDD[(String, Int)] = sc.makeRDD(List(("tom", 1), ("tom", 2), ("jerry", 3), ("ketty", 2)))
val rdd2: RDD[(String, Int)] = sc.makeRDD(List(("jerry", 1), ("tom", 2), ("shuke", 2)))

val rdd3: RDD[(String, (Iterable[Int], Iterable[Int]))] = rdd1.cogroup(rdd2)

val result: RDD[(String, (Int, Int))] = rdd1.join(rdd2)

//Using cogroup to achieve the effect of join
val rdd4: RDD[(String, (Int, Int))] = rdd3.flatMapValues(t => {
  for (x <- t._1.iterator; y <- t._2.iterator) yield (x, y)
})

val leftJoinRDD: RDD[(String, (Int, Option[Int]))] = rdd1.leftOuterJoin(rdd2)
leftJoinRDD.collect().foreach(println)
//leftOuterJoin using cogroup
val rdd5: RDD[(String, (Int, Option[Int]))] = rdd3.flatMapValues((t: (Iterable[Int], Iterable[Int])) => {
  if (t._2.isEmpty) {
    // t._1 may contain multiple elements here, so map each of them to (value, None)
    t._1.map((_, None))
  } else {
    for (x <- t._1.iterator; y <- t._2.iterator) yield (x, Some(y))
  }
})

//Using cogroup to implement rightOuterJoin
val value: RDD[(String, (Option[Int], Int))] = rdd1.rightOuterJoin(rdd2)
val value1: RDD[(String, (Option[Int], Int))] = rdd3.flatMapValues(
  t => {
    if (t._1.isEmpty) {
      t._2.map((None, _))
    } else {
      for (x <- t._1.iterator; y <- t._2.iterator) yield (Some(x), y)
    }
  }
)
value.collect().foreach(println)

// Implement fullOuterJoin using cogroup operator
val fullOuterJoinRDD: RDD[(String, (Option[Int], Option[Int]))] = rdd3.flatMapValues {
  case (i1, Seq()) => i1.iterator.map(x => (Some(x), None))
  case (Seq(), i2) => i2.iterator.map(x => (None, Some(x)))
  case (i1, i2) => for (a <- i1.iterator; b <- i2.iterator) yield (Some(a), Some(b))
}

count operator

/**
 * Run a job on all partitions in an RDD and return the results in an array.
 *
 * @param rdd target RDD to run tasks on
 * @param func a function to run on each partition of the RDD
 * @return in-memory collection with a result of the job (each collection element will contain
 * a result from one partition)
 */
// The returned array contains one result per partition
def runJob[T, U: ClassTag](rdd: RDD[T], func: Iterator[T] => U): Array[U] = {
  runJob(rdd, func, 0 until rdd.partitions.length)
}
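
count is essentially a thin wrapper around runJob: each task counts the elements of its own partition and the driver sums the per-partition counts. A rough sketch of the idea (simplified; rdd here stands for any RDD[String], and Spark's real implementation uses an internal iterator-size utility):

    // One count per partition, computed in the tasks and summed on the driver
    val counts: Array[Long] = sc.runJob(rdd, (it: Iterator[String]) => {
      var c = 0L
      while (it.hasNext) { it.next(); c += 1 }
      c
    })
    val total: Long = counts.sum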

cache operation in rdd

  • When an application triggers actions multiple times, caching the data computed by an RDD avoids repeatedly reading the data from HDFS (the data source) or recomputing it
  • cache stores the data in memory or on disk (the disk of the machine where the executor runs). The data is materialized when the first action is triggered; later actions read the cached RDD data and reuse it
  • Caching an RDD only makes sense when that RDD is used by multiple actions
  • If the data is cached in memory and memory is insufficient, only some of the partitions are cached, in whole-partition units
  • Multiple StorageLevels are supported and the data can be serialized. By default the data is stored in memory as Java objects, which is fast but takes a lot of space; serialized formats can be used instead
  • cache internally calls the persist method, which can be given other storage levels (see the sketch after this list)
  • Strictly speaking, cache and persist are not transformations: no new RDD is generated, the current RDD is simply marked to be cached or persisted
  • It is better to cache or persist the data only after the raw data has been sorted and filtered
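
A minimal sketch of how these points look in code (the input path and the chosen StorageLevel are illustrative):

    import org.apache.spark.storage.StorageLevel

    val lines = sc.textFile("hdfs://namenode:8020/data/input")      // illustrative path
    val cleaned = lines.filter(_.nonEmpty)                          // trim the data before caching

    val cached = cleaned.cache()                                    // same as persist(StorageLevel.MEMORY_ONLY)
    // val cached = cleaned.persist(StorageLevel.MEMORY_AND_DISK_SER) // serialized, can spill to disk

    println(cached.count())   // first action: computes the RDD and fills the cache
    println(cached.count())   // later actions reuse the cached partitions instead of re-reading HDFS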

checkpoint operation in rdd

  • Usage scenario: complex computation (machine learning, iterative computation). To avoid data loss and repeated computation, valuable intermediate results can be saved to HDFS to keep them safe

  • Before calling an RDD's checkpoint method, be sure to set the checkpoint directory first with sc.setCheckpointDir

  • To keep intermediate results safe, the data is saved to HDFS; a distributed file system ensures the data is not lost

  • The checkpoint is actually written when the first action is triggered: an extra job is submitted whose purpose is to save the intermediate result to HDFS

  • Once an RDD has been checkpointed, the lineage (dependencies) before that RDD is no longer used

  • Checkpointing is only meaningful when multiple actions are triggered, e.g. in iterative computation

  • Strictly speaking, checkpoint is not a transformation; it only marks the current RDD as to be checkpointed

  • Caching the RDD before checkpointing avoids computing its data twice. If the data is already cached, the cache is read first and the checkpoint is not used. If, long after the checkpoint, the data saved in HDFS is lost, operating on the related data will raise an error (a minimal usage sketch follows this list)
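
A minimal usage sketch of the checkpoint workflow (the directory and heavyCompute are placeholders):

    sc.setCheckpointDir("hdfs://namenode:8020/spark/checkpoint")   // must be set before calling checkpoint()

    val expensive = rdd.map(heavyCompute)    // heavyCompute stands for some costly logic
    expensive.cache()                        // cache first so the extra checkpoint job reuses the computed data
    expensive.checkpoint()                   // only marks the RDD; nothing is written yet

    expensive.count()   // first action: runs the job, plus an extra job that writes the RDD to HDFS
    expensive.count()   // later actions read the cached/checkpointed data; the old lineage is dropped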

Count the users who have logged in for three consecutive days or more

This pattern extends to many similar problems: members who recharge for several consecutive months, goods sold on consecutive days, ride-hailing (Didi) trips taken on consecutive days, payments overdue for consecutive periods, and so on.

Test data: user id, login date

raw data

guid01,2018-02-28
guid01,2018-03-01
guid01,2018-03-02
guid01,2018-03-04
guid01,2018-03-05
guid01,2018-03-06
guid01,2018-03-07
guid02,2018-03-01
guid02,2018-03-02
guid02,2018-03-03
guid02,2018-03-06

import java.text.SimpleDateFormat
import java.util.{Calendar, Date}

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object UserContinuedLogin {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getCanonicalName).setMaster("local[*]")
    val sc = new SparkContext(conf)
    val rdd: RDD[String] = sc.textFile("data/login.log")

    // Convert to format of type (uid,date)
    val mapRDD: RDD[(String, String)] = rdd.map(line => {
      val strings: Array[String] = line.split(",")
      (strings(0), strings(1))
    })

    val groupRDD: RDD[(String, Iterable[String])] = mapRDD.groupByKey()

    val flatMapRDD: RDD[(String, (String, String))] = groupRDD.flatMapValues(it => {
      // Deduplicate and sort the login dates
      val sorted: List[String] = it.toSet.toList.sortBy((x: String) => x)
      val calendar: Calendar = Calendar.getInstance()
      val sdf = new SimpleDateFormat("yyyy-MM-dd")
      var index = 0
      sorted.map(dateStr => {
        // Subtract the row index from each date; if the resulting dates are equal, the logins are consecutive
        val date: Date = sdf.parse(dateStr)
        calendar.setTime(date)
        calendar.add(Calendar.DATE, -index)
        index += 1
        (dateStr, sdf.format(calendar.getTime))
      })
    })
    
    // Key by (uid, base date): within each group the number of dates is the streak length,
    // and the earliest/latest dates are the streak's bounds
    val result: RDD[(String, Int, String, String)] = flatMapRDD.map(t => ((t._1, t._2._2), t._2._1)).groupByKey().mapValues(it => {
      val list = it.toList.sorted
      val times = list.size
      val beginTime = list.head
      val endTime = list.last
      (times, beginTime, endTime)
    }).filter(t => t._2._1 >= 3).map(t => {
      (t._1._1, t._2._1, t._2._2, t._2._3)
    })
    println(result.collect().toBuffer)
  }
}

Count the top three most popular teachers in each subject

The raw data are as follows

http://bigdata.edu360.cn/laozhang
http://bigdata.edu360.cn/laozhang
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao

Group and sort directly in a list

    val sc: SparkConf = new SparkConf().setMaster("local").setAppName("TopTeacher");
    val context = new SparkContext(sc)
    val rdd: RDD[String] = context.textFile("data/teacher.log")
    val mapRdd: RDD[((String, String), Int)] = rdd.map({
      line: String => {
        // e.g. http://bigdata.edu360.cn/laozhang -> course = "bigdata", teacher = "laozhang"
        val strings: Array[String] = line.split("/")
        val teacher: String = strings(3)
        val course: String = strings(2).split("\\.")(0)
        ((course, teacher), 1)
      }
    })
    val reduceRdd: RDD[((String, String), Int)] = mapRdd.reduceByKey(_ + _)
    val groupRdd: RDD[(String, Iterable[((String, String), Int)])] = reduceRdd.groupBy(_._1._1)
    val result: RDD[(String, List[((String, String), Int)])] = groupRdd.mapValues(_.toList.sortBy(_._2).reverse.take(3))

Find the top N after filtering

    val sc: SparkConf = new SparkConf().setMaster("local").setAppName("TopTeacher");
    val context = new SparkContext(sc)
    val rdd: RDD[String] = context.textFile("data/teacher.log")
    val mapRdd: RDD[((String, String), Int)] = rdd.map({
      line: String => {
        val strings: Array[String] = line.split("/")
        val teacher: String = strings(3)
        val course: String = strings(2).split("\\.")(0)
        ((course, teacher), 1)
      }
    })
		
    val reduceRdd: RDD[((String, String), Int)] = mapRdd.reduceByKey(_ + _)

    val subject = List("bigdata", "javaee", "kafka", "hive")

    for (sb <- subject) {
      val filtered: RDD[((String, String), Int)] = reduceRdd.filter(_._1._1 == sb)
      val favTeacher: Array[((String, String), Int)] = filtered.sortBy(_._2, ascending = false).take(3)
      // sortBy performs a global sort of the whole RDD; a global sort is not actually needed to get the top N
      // top(n) only needs an implicit Ordering and avoids the global sort
      implicit val orderRules: Ordering[((String, String), Int)] = Ordering[Int].on(t => t._2)
      val res = filtered.top(2)
      println(res.toBuffer)
    }
// Alternatively, reduceRdd can be repartitioned with a custom partitioner so that each
// subject's data ends up in its own partition, keeping the groups separate and avoiding data skew

// First, collect the distinct subjects into an array
val subjects: Array[String] = reduceRdd.map(_._1._1).distinct().collect()
val subjectPartitioner = new SubjectPartitioner(subjects)
val partitioned: RDD[((String, String), Int)] = reduceRdd.partitionBy(subjectPartitioner)

//Take the top N within each partition after repartitioning with the custom partitioner
class SubjectPartitioner(val subjects: Array[String]) extends Partitioner{
  val nameToNum = new mutable.HashMap[String,Int]()
  var i = 0
  for (sub <- subjects){
    nameToNum(sub) = i
    i += 1
  }
  override def numPartitions: Int = subjects.length

  // Called in the executors' tasks during the shuffle write, to choose the target partition for each key
  override def getPartition(key: Any): Int = {
    val tuple: (String, String) = key.asInstanceOf[(String, String)]
    nameToNum(tuple._1)
  }
}
// Sort within each partition (one subject per partition) and keep the top 2
val partitionedRDD: RDD[((String, String), Int)] = partitioned.mapPartitions(it => {
  it.toList.sortBy(-_._2).take(2).iterator
})

//The above per-partition sort can be optimized with a bounded "priority queue": a TreeSet capped at N elements
val value2: RDD[((String, String), Int)] = partitioned.mapPartitions(
  it => {
    // Order by count descending; the key breaks ties so elements with equal counts are not collapsed by the set
    implicit val ord: Ordering[((String, String), Int)] = Ordering.by(t => (-t._2, t._1))
    val sorter = new mutable.TreeSet[((String, String), Int)]()
    it.foreach(
      e => {
        sorter.add(e)
        if (sorter.size > 2) {
          sorter -= sorter.last // drop the element with the smallest count
        }
      }
    )
    sorter.iterator
  }
)
println(value2.collect().toBuffer)

In reduceByKey, the number of shuffles is reduced by passing in a custom partitioner

    val sc: SparkConf = new SparkConf().setMaster("local").setAppName("TopTeacher");
    val context = new SparkContext(sc)
    val rdd: RDD[String] = context.textFile("data/teacher.log")
    val mapRdd: RDD[((String, String), Int)] = rdd.map({
      line: String => {
        val strings: Array[String] = line.split("/")
        val teacher: String = strings(3)
        val course: String = strings(2).split("\\.")(0)
        ((course, teacher), 1)
      }
    })
  // Collect the distinct subjects first (SubjectPartitioner is the class defined above)
  val subjects: Array[String] = mapRdd.map(_._1._1).distinct().collect()
  val subjectPartitioner = new SubjectPartitioner(subjects)
  // reduceByKey accepts a partitioner, so the aggregation and the repartitioning share a single shuffle
  val reduceRdd: RDD[((String, String), Int)] = mapRdd.reduceByKey(subjectPartitioner, _ + _)
  //The per-partition top N below again uses the bounded "priority queue" (capped TreeSet) optimization
  val value2: RDD[((String, String), Int)] = reduceRdd.mapPartitions(
    it => {
      // Order by count descending; the key breaks ties so equal counts are not collapsed by the set
      implicit val ord: Ordering[((String, String), Int)] = Ordering.by(t => (-t._2, t._1))
      val sorter = new mutable.TreeSet[((String, String), Int)]()
      it.foreach(
        e => {
          sorter.add(e)
          if (sorter.size > 2) {
            sorter -= sorter.last // keep only the top 2 per partition
          }
        }
      )
      sorter.iterator
    }
  )

Topics: Scala Big Data Spark