12 changed files with 326 additions and 338 deletions
--- a/.gitignore
+++ b/.gitignore
@ -14,3 +14,7 @@ target/
 # Dotty IDE
 /.dotty-ide-artifact
 /.dotty-ide.json
 # datasets
 stackoverflow-grading.csv
 wikipedia-grading.dat
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -25,7 +25,7 @@ grade:
  tags:
    - cs206
  image:
-    name: smarter3/moocs:reactive-actorbintree-2020-04-15
+    name: smarter3/moocs:bigdata-wikipedia-2020-04-30-3
    entrypoint: [""]
  allow_failure: true
  before_script:
--- a/assignment.sbt
+++ b/assignment.sbt
@ -1,4 +1,9 @@
 // Student tasks (i.e. submit, packageSubmission)
 enablePlugins(StudentTasks)
-
+courseraId := ch.epfl.lamp.CourseraId(
  key = "EH8wby4kEeawURILfHIqjw",
  itemId = "CfQX2",
  premiumItemId = Some("QcWcs"),
  partId = "5komc"
 )
--- a/build.sbt
+++ b/build.sbt
@ -1,24 +1,20 @@
-course := "reactive"
+course := "bigdata"
-assignment := "actorbintree"
+assignment := "wikipedia"
-testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-a", "-v", "-s")
+scalaVersion := "0.24.0-RC1"
-parallelExecution in Test := false
+scalacOptions ++= Seq("-language:implicitConversions", "-deprecation")
-
+libraryDependencies ++= Seq(
-val akkaVersion = "2.6.0"
+  "com.novocode" % "junit-interface" % "0.11" % Test,
-
+  ("org.apache.spark" %% "spark-core" % "3.0.0-X1").withDottyCompat(scalaVersion.value),
 scalaVersion := "0.23.0-bin-20200211-5b006fb-NIGHTLY"
 scalacOptions ++= Seq(
  "-feature",
  "-deprecation",
  "-encoding", "UTF-8",
  "-unchecked",
  "-language:implicitConversions"
 )
-libraryDependencies ++= Seq(
+// Contains Spark 3 snapshot built against 2.13: https://github.com/smarter/spark/tree/scala-2.13
-  "com.typesafe.akka"        %% "akka-actor"         % akkaVersion,
+resolvers += Resolver.bintrayRepo("smarter", "maven")
-  "com.typesafe.akka"        %% "akka-testkit"       % akkaVersion % Test,
+
-  "com.novocode"             % "junit-interface"     % "0.11"      % Test
+testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-a", "-v", "-s")
-).map(_.withDottyCompat(scalaVersion.value))
+
-testSuite := "actorbintree.BinaryTreeSuite"
+testSuite := "wikipedia.WikipediaSuite"
 // Without forking, ctrl-c doesn't actually fully stop Spark
 fork in run := true
 fork in Test := true
--- a/grading-tests.jar
+++ b/grading-tests.jar
--- a/src/main/resources/wikipedia/.keep
+++ b/src/main/resources/wikipedia/.keep
--- a/src/main/scala/actorbintree/BinaryTreeSet.scala
+++ b/src/main/scala/actorbintree/BinaryTreeSet.scala
@ -1,189 +0,0 @@
 /**
 * Copyright (C) 2009-2013 Typesafe Inc. <http://www.typesafe.com>
 */
 package actorbintree
 import akka.actor._
 import scala.collection.immutable.Queue
 object BinaryTreeSet {
  trait Operation {
    def requester: ActorRef
    def id: Int
    def elem: Int
  }
  trait OperationReply {
    def id: Int
  }
  /** Request with identifier `id` to insert an element `elem` into the tree.
    * The actor at reference `requester` should be notified when this operation
    * is completed.
    */
  case class Insert(requester: ActorRef, id: Int, elem: Int) extends Operation
  /** Request with identifier `id` to check whether an element `elem` is present
    * in the tree. The actor at reference `requester` should be notified when
    * this operation is completed.
    */
  case class Contains(requester: ActorRef, id: Int, elem: Int) extends Operation
  /** Request with identifier `id` to remove the element `elem` from the tree.
    * The actor at reference `requester` should be notified when this operation
    * is completed.
    */
  case class Remove(requester: ActorRef, id: Int, elem: Int) extends Operation
  /** Request to perform garbage collection */
  case object GC
  /** Holds the answer to the Contains request with identifier `id`.
    * `result` is true if and only if the element is present in the tree.
    */
  case class ContainsResult(id: Int, result: Boolean) extends OperationReply
  /** Message to signal successful completion of an insert or remove operation. */
  case class OperationFinished(id: Int) extends OperationReply
 }
 class BinaryTreeSet extends Actor {
  import BinaryTreeSet._
  import BinaryTreeNode._
  def createRoot: ActorRef = context.actorOf(BinaryTreeNode.props(0, initiallyRemoved = true))
  var root = createRoot
  // optional
  var pendingQueue = Queue.empty[Operation]
  // optional
  def receive = normal
  // optional
  /** Accepts `Operation` and `GC` messages. */
  val normal: Receive = {
    case op:Operation => root ! op
    case GC => {
      val newRoot = createRoot;
      root ! CopyTo(newRoot)
      context.become(garbageCollecting(newRoot))
    }
   }
  // optional
  /** Handles messages while garbage collection is performed.
    * `newRoot` is the root of the new binary tree where we want to copy
    * all non-removed elements into.
    */
  def garbageCollecting(newRoot: ActorRef): Receive = {
    case op:Operation => pendingQueue = pendingQueue.enqueue(op)
    case CopyFinished =>
      pendingQueue.foreach(newRoot ! _) //foreach preserves order of a queue (same as dequeueing)
      root ! PoisonPill //Will also stop all of its children
      pendingQueue = Queue.empty
      root = newRoot;
      context.become(normal)
    //Ignore GC messages here
  }
 }
 object BinaryTreeNode {
  trait Position
  case object Left extends Position
  case object Right extends Position
  case class CopyTo(treeNode: ActorRef)
  case object CopyFinished
  def props(elem: Int, initiallyRemoved: Boolean) = Props(classOf[BinaryTreeNode],  elem, initiallyRemoved)
 }
 class BinaryTreeNode(val elem: Int, initiallyRemoved: Boolean) extends Actor {
  import BinaryTreeNode._
  import BinaryTreeSet._
  var subtrees = Map[Position, ActorRef]()
  var removed = initiallyRemoved
  // optional
  def receive = normal
  def goDownTo(elem : Int) : Position = if(elem < this.elem) Left else Right
  // optional
  /** Handles `Operation` messages and `CopyTo` requests. */
  val normal: Receive = {
    case Insert  (requester, id, elem) =>
      if(elem == this.elem && !removed){
        requester ! OperationFinished(id)
      }else{
        val nextPos = goDownTo(elem)
        subtrees get nextPos match{
          case Some(node) => node ! Insert(requester, id, elem)
          case None => {
              val newActorSubtree = (nextPos, context.actorOf(BinaryTreeNode.props(elem, false)))
              subtrees = subtrees + newActorSubtree
              requester ! OperationFinished(id);
          }
        }
      }
    case Contains(requester, id, elem) =>
      if(elem == this.elem && !removed)
        requester ! ContainsResult(id, true)
      else{
        //Need to search subtrees
        subtrees get goDownTo(elem) match{
          case Some(node) => node ! Contains(requester, id, elem)
          case None => requester ! ContainsResult(id, false)
        }
      }
    case Remove  (requester, id, elem) => 
      if(elem == this.elem && !removed){
        removed = true
        requester ! OperationFinished(id)
      }else{
        subtrees get goDownTo(elem) match{
          case Some(node) => node ! Remove(requester, id, elem)
          case None => requester ! OperationFinished(id) // (elem isn't in the tree)
        }
      }
    case CopyTo(newRoot) =>
      //We are already done, nothing to do
      if(removed && subtrees.isEmpty) context.parent ! CopyFinished
      else{
        if(!removed) newRoot ! Insert(self, elem, elem)
        subtrees.values foreach(_ ! CopyTo(newRoot)) //Copy subtrees elems
        //val insertConfirmed = if(removed) true else false, hence we can simply pass removed
        context.become(copying(subtrees.values.toSet, removed))
      }
  }
  // optional
  /** `expected` is the set of ActorRefs whose replies we are waiting for,
    * `insertConfirmed` tracks whether the copy of this node to the new tree has been confirmed.
    */
  def copying(expected: Set[ActorRef], insertConfirmed: Boolean): Receive = {
    //To catch the insert of this node into the new tree beeing finished
    case OperationFinished(_) => {
      if(expected.isEmpty) context.parent ! CopyFinished
      else context.become(copying(expected, true))
    }
    case CopyFinished => {
      val newExp = expected-sender
      if(insertConfirmed && newExp.isEmpty){
        context.parent ! CopyFinished
      }else{
        context.become(copying(newExp, insertConfirmed))
      }
    }
  }
 }
--- a/src/main/scala/wikipedia/WikipediaData.scala
+++ b/src/main/scala/wikipedia/WikipediaData.scala
@ -0,0 +1,21 @@
 package wikipedia
 import scala.io.Source
 object WikipediaData {
  private[wikipedia] def lines: List[String] = {
    Option(getClass.getResourceAsStream("/wikipedia/wikipedia-grading.dat")) match {
      case None => sys.error("Please download the dataset as explained in the assignment instructions")
      case Some(resource) => Source.fromInputStream(resource).getLines().toList
    }
  }
  private[wikipedia] def parse(line: String): WikipediaArticle = {
    val subs = "</title><text>"
    val i = line.indexOf(subs)
    val title = line.substring(14, i)
    val text  = line.substring(i + subs.length, line.length-16)
    WikipediaArticle(title, text)
  }
 }
--- a/src/main/scala/wikipedia/WikipediaRanking.scala
+++ b/src/main/scala/wikipedia/WikipediaRanking.scala
@ -0,0 +1,115 @@
 package wikipedia
 import org.apache.spark.SparkConf
 import org.apache.spark.SparkContext
 import org.apache.spark.SparkContext._
 import org.apache.log4j.{Logger, Level}
 import org.apache.spark.rdd.RDD
 case class WikipediaArticle(title: String, text: String) {
  /**
    * @return Whether the text of this article mentions `lang` or not
    * @param lang Language to look for (e.g. "Scala")
    */
  def mentionsLanguage(lang: String): Boolean = text.split(' ').contains(lang)
 }
 object WikipediaRanking extends WikipediaRankingInterface {
  // Reduce Spark logging verbosity
  Logger.getLogger("org").setLevel(Level.ERROR)
  val langs = List(
    "JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS",
    "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy")
  val conf: SparkConf = new SparkConf().setMaster("local").setAppName("My wiki app")
  val sc: SparkContext = new SparkContext(conf)
  // Hint: use a combination of `sc.parallelize`, `WikipediaData.lines` and `WikipediaData.parse`
  val wikiRdd: RDD[WikipediaArticle] = sc.parallelize(WikipediaData.lines).map(WikipediaData.parse);
  /** Returns the number of articles on which the language `lang` occurs.
   *  Hint1: consider using method `aggregate` on RDD[T].
   *  Hint2: consider using method `mentionsLanguage` on `WikipediaArticle`
   */
  def occurrencesOfLang(lang: String, rdd: RDD[WikipediaArticle]): Int = {
    def seqOp(acc : Int, art : WikipediaArticle): Int = {
      val x = if(art.mentionsLanguage(lang)) 1 else 0
      acc + x
    }
    rdd.aggregate(0 : Int)(seqOp, _ + _);
  }
  /* (1) Use `occurrencesOfLang` to compute the ranking of the languages
   *     (`val langs`) by determining the number of Wikipedia articles that
   *     mention each language at least once. Don't forget to sort the
   *     languages by their occurrence, in decreasing order!
   *
   *   Note: this operation is long-running. It can potentially run for
   *   several seconds.
   */
  def rankLangs(langs: List[String], rdd: RDD[WikipediaArticle]): List[(String, Int)] = 
    langs.map((lang : String) => (lang, occurrencesOfLang(lang, rdd))).sortBy(_._2).reverse;
  /* Compute an inverted index of the set of articles, mapping each language
   * to the Wikipedia pages in which it occurs.
   */
  def makeIndex(langs: List[String], rdd: RDD[WikipediaArticle]): RDD[(String, Iterable[WikipediaArticle])] = {
    (for{
      art <- rdd
      lang <- langs if art.mentionsLanguage(lang)
    }yield{(lang, art)}).groupByKey
  }
  /* (2) Compute the language ranking again, but now using the inverted index. Can you notice
   *     a performance improvement?
   *
   *   Note: this operation is long-running. It can potentially run for
   *   several seconds.
   */
  def rankLangsUsingIndex(index: RDD[(String, Iterable[WikipediaArticle])]): List[(String, Int)] = 
    index.mapValues(_.size).collect.sortBy(_._2).reverse.toList
  /* (3) Use `reduceByKey` so that the computation of the index and the ranking are combined.
   *     Can you notice an improvement in performance compared to measuring *both* the computation of the index
   *     and the computation of the ranking? If so, can you think of a reason?
   *
   *   Note: this operation is long-running. It can potentially run for
   *   several seconds.
   */
  def rankLangsReduceByKey(langs: List[String], rdd: RDD[WikipediaArticle]): List[(String, Int)] = {
    val reducedRDD = (for{
      art <- rdd
      lang <- langs if art.mentionsLanguage(lang)
    }yield{(lang, 1)}).reduceByKey(_ + _)
    reducedRDD.collect.sortBy(_._2).reverse.toList
  }
  def main(args: Array[String]): Unit = {
    /* Languages ranked according to (1) */
    val langsRanked: List[(String, Int)] = timed("Part 1: naive ranking", rankLangs(langs, wikiRdd))
    /* An inverted index mapping languages to wikipedia pages on which they appear */
    def index: RDD[(String, Iterable[WikipediaArticle])] = makeIndex(langs, wikiRdd)
    /* Languages ranked according to (2), using the inverted index */
    val langsRanked2: List[(String, Int)] = timed("Part 2: ranking using inverted index", rankLangsUsingIndex(index))
    /* Languages ranked according to (3) */
    val langsRanked3: List[(String, Int)] = timed("Part 3: ranking using reduceByKey", rankLangsReduceByKey(langs, wikiRdd))
    /* Output the speed of each ranking */
    println(timing)
    sc.stop()
  }
  val timing = new StringBuffer
  def timed[T](label: String, code: => T): T = {
    val start = System.currentTimeMillis()
    val result = code
    val stop = System.currentTimeMillis()
    timing.append(s"Processing $label took ${stop - start} ms.\n")
    result
  }
 }
--- a/src/main/scala/wikipedia/WikipediaRankingInterface.scala
+++ b/src/main/scala/wikipedia/WikipediaRankingInterface.scala
@ -0,0 +1,20 @@
 package wikipedia
 import org.apache.spark.SparkContext
 import org.apache.spark.SparkContext._
 import org.apache.spark.rdd.RDD
 /**
 * The interface used by the grading infrastructure. Do not change signatures
 * or your submission will fail with a NoSuchMethodError.
 */
 trait WikipediaRankingInterface {
  def makeIndex(langs: List[String], rdd: RDD[WikipediaArticle]): RDD[(String, Iterable[WikipediaArticle])]
  def occurrencesOfLang(lang: String, rdd: RDD[WikipediaArticle]): Int
  def rankLangs(langs: List[String], rdd: RDD[WikipediaArticle]): List[(String, Int)]
  def rankLangsReduceByKey(langs: List[String], rdd: RDD[WikipediaArticle]): List[(String, Int)]
  def rankLangsUsingIndex(index: RDD[(String, Iterable[WikipediaArticle])]): List[(String, Int)]
  def langs: List[String]
  def sc: SparkContext
  def wikiRdd: RDD[WikipediaArticle]
 }
--- a/src/test/scala/actorbintree/BinaryTreeSuite.scala
+++ b/src/test/scala/actorbintree/BinaryTreeSuite.scala
@ -1,126 +0,0 @@
 /**
 * Copyright (C) 2009-2015 Typesafe Inc. <http://www.typesafe.com>
 */
 package actorbintree
 import akka.actor.{ActorRef, ActorSystem, Props, actorRef2Scala, scala2ActorRef}
 import akka.testkit.{ImplicitSender, TestKit, TestProbe}
 import org.junit.Test
 import org.junit.Assert._
 import scala.util.Random
 import scala.concurrent.duration._
 class BinaryTreeSuite extends TestKit(ActorSystem("BinaryTreeSuite")) with ImplicitSender {
  import actorbintree.BinaryTreeSet._
  def receiveN(requester: TestProbe, ops: Seq[Operation], expectedReplies: Seq[OperationReply]): Unit =
    requester.within(5.seconds) {
      val repliesUnsorted = for (i <- 1 to ops.size) yield try {
        requester.expectMsgType[OperationReply]
      } catch {
        case ex: Throwable if ops.size > 10 => sys.error(s"failure to receive confirmation $i/${ops.size}\n$ex")
        case ex: Throwable                  => sys.error(s"failure to receive confirmation $i/${ops.size}\nRequests:" + ops.mkString("\n    ", "\n     ", "") + s"\n$ex")
      }
      val replies = repliesUnsorted.sortBy(_.id)
      if (replies != expectedReplies) {
        val pairs = (replies zip expectedReplies).zipWithIndex filter (x => x._1._1 != x._1._2)
        fail("unexpected replies:" + pairs.map(x => s"at index ${x._2}: got ${x._1._1}, expected ${x._1._2}").mkString("\n    ", "\n    ", ""))
      }
    }
  def verify(probe: TestProbe, ops: Seq[Operation], expected: Seq[OperationReply]): Unit = {
    val topNode = system.actorOf(Props[BinaryTreeSet])
    ops foreach { op =>
      topNode ! op
    }
    receiveN(probe, ops, expected)
    // the grader also verifies that enough actors are created
  }
  @Test def `proper inserts and lookups (5pts)`(): Unit = {
    val topNode = system.actorOf(Props[BinaryTreeSet])
    topNode ! Contains(testActor, id = 1, 1)
    expectMsg(ContainsResult(1, false))
    topNode ! Insert(testActor, id = 2, 1)
    topNode ! Contains(testActor, id = 3, 1)
    expectMsg(OperationFinished(2))
    expectMsg(ContainsResult(3, true))
    ()
  }
  @Test def `instruction example (5pts)`(): Unit = {
    val requester = TestProbe()
    val requesterRef = requester.ref
    val ops = List(
      Insert(requesterRef, id=100, 1),
      Contains(requesterRef, id=50, 2),
      Remove(requesterRef, id=10, 1),
      Insert(requesterRef, id=20, 2),
      Contains(requesterRef, id=80, 1),
      Contains(requesterRef, id=70, 2)
    )
    val expectedReplies = List(
      OperationFinished(id=10),
      OperationFinished(id=20),
      ContainsResult(id=50, false),
      ContainsResult(id=70, true),
      ContainsResult(id=80, false),
      OperationFinished(id=100)
    )
    verify(requester, ops, expectedReplies)
  }
  @Test def `behave identically to built-in set (includes GC) (40pts)`(): Unit = {
    val rnd = new Random()
    def randomOperations(requester: ActorRef, count: Int): Seq[Operation] = {
      def randomElement: Int = rnd.nextInt(100)
      def randomOperation(requester: ActorRef, id: Int): Operation = rnd.nextInt(4) match {
        case 0 => Insert(requester, id, randomElement)
        case 1 => Insert(requester, id, randomElement)
        case 2 => Contains(requester, id, randomElement)
        case 3 => Remove(requester, id, randomElement)
      }
      for (seq <- 0 until count) yield randomOperation(requester, seq)
    }
    def referenceReplies(operations: Seq[Operation]): Seq[OperationReply] = {
      var referenceSet = Set.empty[Int]
      def replyFor(op: Operation): OperationReply = op match {
        case Insert(_, seq, elem) =>
          referenceSet = referenceSet + elem
          OperationFinished(seq)
        case Remove(_, seq, elem) =>
          referenceSet = referenceSet - elem
          OperationFinished(seq)
        case Contains(_, seq, elem) =>
          ContainsResult(seq, referenceSet(elem))
      }
      for (op <- operations) yield replyFor(op)
    }
    val requester = TestProbe()
    val topNode = system.actorOf(Props[BinaryTreeSet])
    val count = 1000
    val ops = randomOperations(requester.ref, count)
    val expectedReplies = referenceReplies(ops)
    ops foreach { op =>
      topNode ! op
      if (rnd.nextDouble() < 0.1) topNode ! GC
    }
    receiveN(requester, ops, expectedReplies)
  }
 }
--- a/src/test/scala/wikipedia/WikipediaSuite.scala
+++ b/src/test/scala/wikipedia/WikipediaSuite.scala
@ -0,0 +1,142 @@
 package wikipedia
 import org.apache.spark.SparkConf
 import org.apache.spark.SparkContext
 import org.apache.spark.SparkContext._
 import org.junit._
 class WikipediaSuite {
  def initializeWikipediaRanking(): Boolean =
    try {
      WikipediaRanking
      true
    } catch {
      case ex: Throwable =>
        println(ex.getMessage)
        ex.printStackTrace()
        false
    }
  import WikipediaRanking._
  /**
    * Creates a truncated string representation of a list, adding ", ...)" if there
    * are too many elements to show
    * @param l The list to preview
    * @param n The number of elements to cut it at
    * @return A preview of the list, containing at most n elements.
    */
  def previewList[A](l: List[A], n: Int = 10): String =
    if (l.length <= n) l.toString
    else l.take(n).toString.dropRight(1) + ", ...)"
  /**
    * Asserts that all the elements in a given list and an expected list are the same,
    * regardless of order. For a prettier output, given and expected should be sorted
    * with the same ordering.
    * @param actual The actual list
    * @param expected The expected list
    * @tparam A Type of the list elements
    */
  def assertSameElements[A](actual: List[A], expected: List[A]): Unit = {
    val givenSet = actual.toSet
    val expectedSet = expected.toSet
    val unexpected = givenSet -- expectedSet
    val missing = expectedSet -- givenSet
    val noUnexpectedElements = unexpected.isEmpty
    val noMissingElements = missing.isEmpty
    val noMatchString =
      s"""
         |Expected: ${previewList(expected)}
         |Actual:   ${previewList(actual)}""".stripMargin
    assert(noUnexpectedElements,
      s"""|$noMatchString
          |The given collection contains some unexpected elements: ${previewList(unexpected.toList, 5)}""".stripMargin)
    assert(noMissingElements,
      s"""|$noMatchString
          |The given collection is missing some expected elements: ${previewList(missing.toList, 5)}""".stripMargin)
  }
  // Conditions:
  // (1) the language stats contain the same elements
  // (2) they are ordered (and the order doesn't matter if there are several languages with the same count)
  def assertEquivalentAndOrdered(actual: List[(String, Int)], expected: List[(String, Int)]): Unit = {
    // (1)
    assertSameElements(actual, expected)
    // (2)
    assert(
      !(actual zip actual.tail).exists({ case ((_, occ1), (_, occ2)) => occ1 < occ2 }),
      "The given elements are not in descending order"
    )
  }
  @Test def `'occurrencesOfLang' should work for (specific) RDD with one element`: Unit = {
    assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?")
    val rdd = sc.parallelize(Seq(WikipediaArticle("title", "Java Jakarta")))
    val res = (occurrencesOfLang("Java", rdd) == 1)
    assert(res, "occurrencesOfLang given (specific) RDD with one element should equal to 1")
  }
  @Test def `'rankLangs' should work for RDD with two elements`: Unit = {
    assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?")
    val langs = List("Scala", "Java")
    val rdd = sc.parallelize(List(WikipediaArticle("1", "Scala is great"), WikipediaArticle("2", "Java is OK, but Scala is cooler")))
    val ranked = rankLangs(langs, rdd)
    val res = ranked.head._1 == "Scala"
    assert(res)
  }
  @Test def `'makeIndex' creates a simple index with two entries`: Unit = {
    assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?")
    val langs = List("Scala", "Java")
    val articles = List(
        WikipediaArticle("1","Groovy is pretty interesting, and so is Erlang"),
        WikipediaArticle("2","Scala and Java run on the JVM"),
        WikipediaArticle("3","Scala is not purely functional")
      )
    val rdd = sc.parallelize(articles)
    val index = makeIndex(langs, rdd)
    val res = index.count() == 2
    assert(res)
  }
  @Test def `'rankLangsUsingIndex' should work for a simple RDD with three elements`: Unit = {
    assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?")
    val langs = List("Scala", "Java")
    val articles = List(
        WikipediaArticle("1","Groovy is pretty interesting, and so is Erlang"),
        WikipediaArticle("2","Scala and Java run on the JVM"),
        WikipediaArticle("3","Scala is not purely functional")
      )
    val rdd = sc.parallelize(articles)
    val index = makeIndex(langs, rdd)
    val ranked = rankLangsUsingIndex(index)
    val res = (ranked.head._1 == "Scala")
    assert(res)
  }
  @Test def `'rankLangsReduceByKey' should work for a simple RDD with five elements`: Unit = {
    assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?")
    val langs = List("Scala", "Java", "Groovy", "Haskell", "Erlang")
    val articles = List(
        WikipediaArticle("1","Groovy is pretty interesting, and so is Erlang"),
        WikipediaArticle("2","Scala and Java run on the JVM"),
        WikipediaArticle("3","Scala is not purely functional"),
        WikipediaArticle("4","The cool kids like Haskell more than Java"),
        WikipediaArticle("5","Java is for enterprise developers")
      )
    val rdd = sc.parallelize(articles)
    val ranked = rankLangsReduceByKey(langs, rdd)
    val res = (ranked.head._1 == "Java")
    assert(res)
  }
  @Rule def individualTestTimeout = new org.junit.rules.Timeout(100 * 1000)
 }