Compare commits

...

No commits in common. "actorbintree" and "stackoverflow" have entirely different histories.

11 changed files with 404 additions and 338 deletions

4 .gitignore vendored
View File

@@ -14,3 +14,7 @@ target/
# Dotty IDE
/.dotty-ide-artifact
/.dotty-ide.json
# datasets
stackoverflow-grading.csv
wikipedia-grading.dat

View File

@@ -25,7 +25,7 @@ grade:
tags:
- cs206
image:
name: smarter3/moocs:reactive-actorbintree-2020-04-15
name: smarter3/moocs:bigdata-stackoverflow-2020-05-11-2
entrypoint: [""]
allow_failure: true
before_script:

View File

@@ -1,4 +1,9 @@
// Student tasks (i.e. submit, packageSubmission)
enablePlugins(StudentTasks)
courseraId := ch.epfl.lamp.CourseraId(
key = "7ByAoS4kEea1yxIfJA1CUw",
itemId = "QhzMw",
premiumItemId = Some("FWGnz"),
partId = "OY5fJ"
)

View File

@@ -1,24 +1,20 @@
course := "reactive"
assignment := "actorbintree"
course := "bigdata"
assignment := "stackoverflow"
testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-a", "-v", "-s")
parallelExecution in Test := false
val akkaVersion = "2.6.0"
scalaVersion := "0.23.0-bin-20200211-5b006fb-NIGHTLY"
scalacOptions ++= Seq(
"-feature",
"-deprecation",
"-encoding", "UTF-8",
"-unchecked",
"-language:implicitConversions"
scalaVersion := "0.24.0-RC1"
scalacOptions ++= Seq("-language:implicitConversions", "-deprecation")
libraryDependencies ++= Seq(
"com.novocode" % "junit-interface" % "0.11" % Test,
("org.apache.spark" %% "spark-core" % "3.0.0-X1").withDottyCompat(scalaVersion.value),
)
libraryDependencies ++= Seq(
"com.typesafe.akka" %% "akka-actor" % akkaVersion,
"com.typesafe.akka" %% "akka-testkit" % akkaVersion % Test,
"com.novocode" % "junit-interface" % "0.11" % Test
).map(_.withDottyCompat(scalaVersion.value))
testSuite := "actorbintree.BinaryTreeSuite"
// Contains Spark 3 snapshot built against 2.13: https://github.com/smarter/spark/tree/scala-2.13
resolvers += Resolver.bintrayRepo("smarter", "maven")
testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-a", "-v", "-s")
testSuite := "stackoverflow.StackOverflowSuite"
// Without forking, ctrl-c doesn't actually fully stop Spark
fork in run := true
fork in Test := true

Binary file not shown.

View File

View File

@ -1,189 +0,0 @@
/**
* Copyright (C) 2009-2013 Typesafe Inc. <http://www.typesafe.com>
*/
package actorbintree
import akka.actor._
import scala.collection.immutable.Queue
object BinaryTreeSet {
trait Operation {
def requester: ActorRef
def id: Int
def elem: Int
}
trait OperationReply {
def id: Int
}
/** Request with identifier `id` to insert an element `elem` into the tree.
* The actor at reference `requester` should be notified when this operation
* is completed.
*/
case class Insert(requester: ActorRef, id: Int, elem: Int) extends Operation
/** Request with identifier `id` to check whether an element `elem` is present
* in the tree. The actor at reference `requester` should be notified when
* this operation is completed.
*/
case class Contains(requester: ActorRef, id: Int, elem: Int) extends Operation
/** Request with identifier `id` to remove the element `elem` from the tree.
* The actor at reference `requester` should be notified when this operation
* is completed.
*/
case class Remove(requester: ActorRef, id: Int, elem: Int) extends Operation
/** Request to perform garbage collection */
case object GC
/** Holds the answer to the Contains request with identifier `id`.
* `result` is true if and only if the element is present in the tree.
*/
case class ContainsResult(id: Int, result: Boolean) extends OperationReply
/** Message to signal successful completion of an insert or remove operation. */
case class OperationFinished(id: Int) extends OperationReply
}
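// Example exchange (a sketch, not part of the original file; assumes `treeSet` is an
// ActorRef to a BinaryTreeSet and `requester` is the actor that should get the replies):
//   treeSet ! Insert(requester, id = 1, elem = 42)    // requester receives OperationFinished(1)
//   treeSet ! Contains(requester, id = 2, elem = 42)  // requester receives ContainsResult(2, true)
//   treeSet ! Remove(requester, id = 3, elem = 42)    // requester receives OperationFinished(3)
//   treeSet ! GC                                      // live elements are copied to a fresh root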
class BinaryTreeSet extends Actor {
import BinaryTreeSet._
import BinaryTreeNode._
def createRoot: ActorRef = context.actorOf(BinaryTreeNode.props(0, initiallyRemoved = true))
var root = createRoot
// optional
var pendingQueue = Queue.empty[Operation]
// optional
def receive = normal
// optional
/** Accepts `Operation` and `GC` messages. */
val normal: Receive = {
case op:Operation => root ! op
case GC => {
val newRoot = createRoot;
root ! CopyTo(newRoot)
context.become(garbageCollecting(newRoot))
}
}
// optional
/** Handles messages while garbage collection is performed.
* `newRoot` is the root of the new binary tree where we want to copy
* all non-removed elements into.
*/
def garbageCollecting(newRoot: ActorRef): Receive = {
case op:Operation => pendingQueue = pendingQueue.enqueue(op)
case CopyFinished =>
pendingQueue.foreach(newRoot ! _) //foreach preserves order of a queue (same as dequeueing)
root ! PoisonPill //Will also stop all of its children
pendingQueue = Queue.empty
root = newRoot;
context.become(normal)
//Ignore GC messages here
}
}
object BinaryTreeNode {
trait Position
case object Left extends Position
case object Right extends Position
case class CopyTo(treeNode: ActorRef)
case object CopyFinished
def props(elem: Int, initiallyRemoved: Boolean) = Props(classOf[BinaryTreeNode], elem, initiallyRemoved)
}
class BinaryTreeNode(val elem: Int, initiallyRemoved: Boolean) extends Actor {
import BinaryTreeNode._
import BinaryTreeSet._
var subtrees = Map[Position, ActorRef]()
var removed = initiallyRemoved
// optional
def receive = normal
def goDownTo(elem : Int) : Position = if(elem < this.elem) Left else Right
// optional
/** Handles `Operation` messages and `CopyTo` requests. */
val normal: Receive = {
case Insert (requester, id, elem) =>
if(elem == this.elem && !removed){
requester ! OperationFinished(id)
}else{
val nextPos = goDownTo(elem)
subtrees get nextPos match{
case Some(node) => node ! Insert(requester, id, elem)
case None => {
val newActorSubtree = (nextPos, context.actorOf(BinaryTreeNode.props(elem, false)))
subtrees = subtrees + newActorSubtree
requester ! OperationFinished(id);
}
}
}
case Contains(requester, id, elem) =>
if(elem == this.elem && !removed)
requester ! ContainsResult(id, true)
else{
//Need to search subtrees
subtrees get goDownTo(elem) match{
case Some(node) => node ! Contains(requester, id, elem)
case None => requester ! ContainsResult(id, false)
}
}
case Remove (requester, id, elem) =>
if(elem == this.elem && !removed){
removed = true
requester ! OperationFinished(id)
}else{
subtrees get goDownTo(elem) match{
case Some(node) => node ! Remove(requester, id, elem)
case None => requester ! OperationFinished(id) // (elem isn't in the tree)
}
}
case CopyTo(newRoot) =>
//We are already done, nothing to do
if(removed && subtrees.isEmpty) context.parent ! CopyFinished
else{
if(!removed) newRoot ! Insert(self, elem, elem)
subtrees.values foreach(_ ! CopyTo(newRoot)) //Copy subtrees elems
// insertConfirmed would be `if (removed) true else false`, so we can simply pass `removed`
context.become(copying(subtrees.values.toSet, removed))
}
}
// optional
/** `expected` is the set of ActorRefs whose replies we are waiting for,
* `insertConfirmed` tracks whether the copy of this node to the new tree has been confirmed.
*/
def copying(expected: Set[ActorRef], insertConfirmed: Boolean): Receive = {
// To catch the insert of this node into the new tree being finished
case OperationFinished(_) => {
if(expected.isEmpty) context.parent ! CopyFinished
else context.become(copying(expected, true))
}
case CopyFinished => {
val newExp = expected-sender
if(insertConfirmed && newExp.isEmpty){
context.parent ! CopyFinished
}else{
context.become(copying(newExp, insertConfirmed))
}
}
}
}

View File

@@ -0,0 +1,329 @@
package stackoverflow
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.log4j.{Logger, Level}
import annotation.tailrec
import scala.reflect.ClassTag
import scala.util.Properties.isWin
type Question = Posting
type Answer = Posting
type QID = Int
type HighScore = Int
type LangIndex = Int
/** A raw stackoverflow posting, either a question or an answer */
case class Posting(postingType: Int, id: Int, acceptedAnswer: Option[Int], parentId: Option[QID], score: Int, tags: Option[String]) extends Serializable
/** The main class */
object StackOverflow extends StackOverflow {
// Reduce Spark logging verbosity
Logger.getLogger("org").setLevel(Level.ERROR)
if (isWin) System.setProperty("hadoop.home.dir", System.getProperty("user.dir") + "\\winutils\\hadoop-2.7.4")
@transient lazy val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("StackOverflow")
@transient lazy val sc: SparkContext = new SparkContext(conf)
/** Main function */
def main(args: Array[String]): Unit = {
val lines = sc.textFile("src/main/resources/stackoverflow/stackoverflow-grading.csv")
val raw = rawPostings(lines)
val grouped = groupedPostings(raw)
val scored = scoredPostings(grouped)
val vectors = vectorPostings(scored)
// assert(vectors.count() == 2121822, "Incorrect number of vectors: " + vectors.count())
val means = kmeans(sampleVectors(vectors), vectors, debug = true)
val results = clusterResults(means, vectors)
printResults(results)
}
}
/** The parsing and kmeans methods */
class StackOverflow extends Serializable {
/** Languages */
val langs =
List(
"JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS",
"Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy")
/** K-means parameter: How "far apart" should languages be for the kmeans algorithm? */
def langSpread = 50000
assert(langSpread > 0, "If langSpread is zero we can't recover the language from the input data!")
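// Worked example (illustrative): "Scala" sits at index 10 in `langs`, so a Scala question
// whose highest-scoring answer has score 42 is mapped by `vectorPostings` below to the
// vector (10 * 50000, 42) = (500000, 42).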
/** K-means parameter: Number of clusters */
def kmeansKernels = 45
/** K-means parameter: Convergence criteria */
def kmeansEta: Double = 20.0D
/** K-means parameter: Maximum iterations */
def kmeansMaxIterations = 120
//
//
// Parsing utilities:
//
//
/** Load postings from the given file */
def rawPostings(lines: RDD[String]): RDD[Posting] =
lines.map(line => {
val arr = line.split(",")
Posting(postingType = arr(0).toInt,
id = arr(1).toInt,
acceptedAnswer = if (arr(2) == "") None else Some(arr(2).toInt),
parentId = if (arr(3) == "") None else Some(arr(3).toInt),
score = arr(4).toInt,
tags = if (arr.length >= 6) Some(arr(5).intern()) else None)
})
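// Illustrative example (hypothetical input line; field layout inferred from the parser above):
//   "1,27233496,,,23,PHP"  ==>  Posting(postingType = 1, id = 27233496,
//     acceptedAnswer = None, parentId = None, score = 23, tags = Some("PHP"))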
/** Group the questions and answers together */
def groupedPostings(postings: RDD[Posting]): RDD[(QID, Iterable[(Question, Answer)])] = {
val questions : RDD[Question] = postings.filter(_.postingType == 1)
val answers : RDD[Answer] = postings.filter(_.postingType == 2)
val qidQuestionPair : RDD[(QID, Question)] = questions.map(q => (q.id, q))
val qidAnswerPair : RDD[(QID, Answer)] = answers.map(a => (a.parentId.get, a))
val qidQA : RDD[(QID, (Question, Answer))] = qidQuestionPair.join(qidAnswerPair)
qidQA.groupByKey
}
/** Compute the maximum score for each posting */
def scoredPostings(grouped: RDD[(QID, Iterable[(Question, Answer)])]): RDD[(Question, HighScore)] = {
def answerHighScore(as: Array[Answer]): HighScore = {
var highScore = 0
var i = 0
while (i < as.length) {
val score = as(i).score
if (score > highScore)
highScore = score
i += 1
}
highScore
}
grouped.map{case (_, iter) =>
val question = iter.head._1;
val answers = iter map { case (q, a) => a }
(question, answerHighScore(answers.toArray))
}
}
/** Compute the vectors for the kmeans */
def vectorPostings(scored: RDD[(Question, HighScore)]): RDD[(LangIndex, HighScore)] = {
/** Return optional index of first language that occurs in `tags`. */
def firstLangInTag(tag: Option[String], ls: List[String]): Option[Int] = {
if (tag.isEmpty) None
else if (ls.isEmpty) None
else if (tag.get == ls.head) Some(0) // index: 0
else {
val tmp = firstLangInTag(tag, ls.tail)
tmp match {
case None => None
case Some(i) => Some(i + 1) // index i in ls.tail => index i+1
}
}
}
scored.map{ case (q, hs) => (firstLangInTag(q.tags, langs).get * langSpread, hs)}.persist()
}
/** Sample the vectors */
def sampleVectors(vectors: RDD[(LangIndex, HighScore)]): Array[(Int, Int)] = {
assert(kmeansKernels % langs.length == 0, "kmeansKernels should be a multiple of the number of languages studied.")
val perLang = kmeansKernels / langs.length
// http://en.wikipedia.org/wiki/Reservoir_sampling
def reservoirSampling(lang: Int, iter: Iterator[Int], size: Int): Array[Int] = {
val res = new Array[Int](size)
val rnd = new util.Random(lang)
for (i <- 0 until size) {
assert(iter.hasNext, s"iterator must have at least $size elements")
res(i) = iter.next
}
var i = size.toLong
while (iter.hasNext) {
val elt = iter.next
val j = math.abs(rnd.nextLong) % i
if (j < size)
res(j.toInt) = elt
i += 1
}
res
}
val res =
if (langSpread < 500)
// sample the space regardless of the language
vectors.takeSample(false, kmeansKernels, 42)
else
// sample the space uniformly from each language partition
vectors.groupByKey.flatMap({
case (lang, vectors) => reservoirSampling(lang, vectors.iterator, perLang).map((lang, _))
}).collect()
assert(res.length == kmeansKernels, res.length)
res
}
//
//
// Kmeans method:
//
//
/** Main kmeans computation */
@tailrec final def kmeans(means: Array[(Int, Int)], vectors: RDD[(Int, Int)], iter: Int = 1, debug: Boolean = false): Array[(Int, Int)] = {
val newMeans = means.clone() // you need to compute newMeans
// TODO: Fill in the newMeans array
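// A minimal sketch of one possible way to fill newMeans (illustrative only, using the
// helpers defined further down in this file): assign each vector to its closest mean,
// average each cluster, and leave means of empty clusters unchanged (already cloned).
//   val clusterAverages = vectors
//     .map(v => (findClosest(v, means), v))   // cluster index -> vector
//     .groupByKey()
//     .mapValues(averageVectors)              // average each cluster on the executors
//     .collectAsMap()                         // bring the (few) new means to the driver
//   clusterAverages.foreach { case (idx, mean) => newMeans(idx) = mean }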
val distance = euclideanDistance(means, newMeans)
if (debug) {
println(s"""Iteration: $iter
| * current distance: $distance
| * desired distance: $kmeansEta
| * means:""".stripMargin)
for (idx <- 0 until kmeansKernels)
println(f" ${means(idx).toString}%20s ==> ${newMeans(idx).toString}%20s " +
f" distance: ${euclideanDistance(means(idx), newMeans(idx))}%8.0f")
}
if (converged(distance))
newMeans
else if (iter < kmeansMaxIterations)
kmeans(newMeans, vectors, iter + 1, debug)
else {
if (debug) {
println("Reached max iterations!")
}
newMeans
}
}
//
//
// Kmeans utilities:
//
//
/** Decide whether the kmeans clustering converged */
def converged(distance: Double) =
distance < kmeansEta
/** Return the squared euclidean distance between two points */
def euclideanDistance(v1: (Int, Int), v2: (Int, Int)): Double = {
val part1 = (v1._1 - v2._1).toDouble * (v1._1 - v2._1)
val part2 = (v1._2 - v2._2).toDouble * (v1._2 - v2._2)
part1 + part2
}
/** Return the sum of squared euclidean distances between corresponding points of two arrays */
def euclideanDistance(a1: Array[(Int, Int)], a2: Array[(Int, Int)]): Double = {
assert(a1.length == a2.length)
var sum = 0d
var idx = 0
while(idx < a1.length) {
sum += euclideanDistance(a1(idx), a2(idx))
idx += 1
}
sum
}
/** Return the closest point */
def findClosest(p: (Int, Int), centers: Array[(Int, Int)]): Int = {
var bestIndex = 0
var closest = Double.PositiveInfinity
for (i <- 0 until centers.length) {
val tempDist = euclideanDistance(p, centers(i))
if (tempDist < closest) {
closest = tempDist
bestIndex = i
}
}
bestIndex
}
/** Average the vectors */
def averageVectors(ps: Iterable[(Int, Int)]): (Int, Int) = {
val iter = ps.iterator
var count = 0
var comp1: Long = 0
var comp2: Long = 0
while (iter.hasNext) {
val item = iter.next
comp1 += item._1
comp2 += item._2
count += 1
}
((comp1 / count).toInt, (comp2 / count).toInt)
}
//
//
// Displaying results:
//
//
def clusterResults(means: Array[(Int, Int)], vectors: RDD[(LangIndex, HighScore)]): Array[(String, Double, Int, Int)] = {
val closest = vectors.map(p => (findClosest(p, means), p))
val closestGrouped = closest.groupByKey()
val median = closestGrouped.mapValues { vs =>
val dominantLangIndex = vs.groupBy(_._1).maxBy(_._2.size)._1 / langSpread
val langLabel: String = langs(dominantLangIndex)// most common language in the cluster
val nbOfDominant = vs.count(_._1 / langSpread == dominantLangIndex)
val langPercent: Double = 100.0 * (nbOfDominant.toDouble / vs.size) // percent of the questions in the most common language
val clusterSize: Int = vs.size
val medianScore: Int = {
val highscores = vs.map{ case (li, hs) => hs }
val sortedHighscores = highscores.toArray.sorted;
val mid = highscores.size / 2;
highscores.size % 2 match{
case 1 => sortedHighscores(mid)
case 0 => (sortedHighscores(mid-1) + sortedHighscores(mid)) / 2
}
}
(langLabel, langPercent, clusterSize, medianScore)
}
median.collect().map(_._2).sortBy(_._4)
}
def printResults(results: Array[(String, Double, Int, Int)]): Unit = {
println("Resulting clusters:")
println(" Score Dominant language (%percent) Questions")
println("================================================")
for ((lang, percent, size, score) <- results)
println(f"${score}%7d ${lang}%-17s (${percent}%-5.1f%%) ${size}%7d")
}
}

View File

@@ -1,126 +0,0 @@
/**
* Copyright (C) 2009-2015 Typesafe Inc. <http://www.typesafe.com>
*/
package actorbintree
import akka.actor.{ActorRef, ActorSystem, Props, actorRef2Scala, scala2ActorRef}
import akka.testkit.{ImplicitSender, TestKit, TestProbe}
import org.junit.Test
import org.junit.Assert._
import scala.util.Random
import scala.concurrent.duration._
class BinaryTreeSuite extends TestKit(ActorSystem("BinaryTreeSuite")) with ImplicitSender {
import actorbintree.BinaryTreeSet._
def receiveN(requester: TestProbe, ops: Seq[Operation], expectedReplies: Seq[OperationReply]): Unit =
requester.within(5.seconds) {
val repliesUnsorted = for (i <- 1 to ops.size) yield try {
requester.expectMsgType[OperationReply]
} catch {
case ex: Throwable if ops.size > 10 => sys.error(s"failure to receive confirmation $i/${ops.size}\n$ex")
case ex: Throwable => sys.error(s"failure to receive confirmation $i/${ops.size}\nRequests:" + ops.mkString("\n ", "\n ", "") + s"\n$ex")
}
val replies = repliesUnsorted.sortBy(_.id)
if (replies != expectedReplies) {
val pairs = (replies zip expectedReplies).zipWithIndex filter (x => x._1._1 != x._1._2)
fail("unexpected replies:" + pairs.map(x => s"at index ${x._2}: got ${x._1._1}, expected ${x._1._2}").mkString("\n ", "\n ", ""))
}
}
def verify(probe: TestProbe, ops: Seq[Operation], expected: Seq[OperationReply]): Unit = {
val topNode = system.actorOf(Props[BinaryTreeSet])
ops foreach { op =>
topNode ! op
}
receiveN(probe, ops, expected)
// the grader also verifies that enough actors are created
}
@Test def `proper inserts and lookups (5pts)`(): Unit = {
val topNode = system.actorOf(Props[BinaryTreeSet])
topNode ! Contains(testActor, id = 1, 1)
expectMsg(ContainsResult(1, false))
topNode ! Insert(testActor, id = 2, 1)
topNode ! Contains(testActor, id = 3, 1)
expectMsg(OperationFinished(2))
expectMsg(ContainsResult(3, true))
()
}
@Test def `instruction example (5pts)`(): Unit = {
val requester = TestProbe()
val requesterRef = requester.ref
val ops = List(
Insert(requesterRef, id=100, 1),
Contains(requesterRef, id=50, 2),
Remove(requesterRef, id=10, 1),
Insert(requesterRef, id=20, 2),
Contains(requesterRef, id=80, 1),
Contains(requesterRef, id=70, 2)
)
val expectedReplies = List(
OperationFinished(id=10),
OperationFinished(id=20),
ContainsResult(id=50, false),
ContainsResult(id=70, true),
ContainsResult(id=80, false),
OperationFinished(id=100)
)
verify(requester, ops, expectedReplies)
}
@Test def `behave identically to built-in set (includes GC) (40pts)`(): Unit = {
val rnd = new Random()
def randomOperations(requester: ActorRef, count: Int): Seq[Operation] = {
def randomElement: Int = rnd.nextInt(100)
def randomOperation(requester: ActorRef, id: Int): Operation = rnd.nextInt(4) match {
case 0 => Insert(requester, id, randomElement)
case 1 => Insert(requester, id, randomElement)
case 2 => Contains(requester, id, randomElement)
case 3 => Remove(requester, id, randomElement)
}
for (seq <- 0 until count) yield randomOperation(requester, seq)
}
def referenceReplies(operations: Seq[Operation]): Seq[OperationReply] = {
var referenceSet = Set.empty[Int]
def replyFor(op: Operation): OperationReply = op match {
case Insert(_, seq, elem) =>
referenceSet = referenceSet + elem
OperationFinished(seq)
case Remove(_, seq, elem) =>
referenceSet = referenceSet - elem
OperationFinished(seq)
case Contains(_, seq, elem) =>
ContainsResult(seq, referenceSet(elem))
}
for (op <- operations) yield replyFor(op)
}
val requester = TestProbe()
val topNode = system.actorOf(Props[BinaryTreeSet])
val count = 1000
val ops = randomOperations(requester.ref, count)
val expectedReplies = referenceReplies(ops)
ops foreach { op =>
topNode ! op
if (rnd.nextDouble() < 0.1) topNode ! GC
}
receiveN(requester, ops, expectedReplies)
}
}

View File

@@ -0,0 +1,47 @@
package stackoverflow
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.junit._
import org.junit.Assert.assertEquals
import java.io.File
import scala.io.{ Codec, Source }
import scala.util.Properties.isWin
object StackOverflowSuite {
if (isWin) System.setProperty("hadoop.home.dir", System.getProperty("user.dir") + "\\winutils\\hadoop-2.7.4")
val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("StackOverflow")
val sc: SparkContext = new SparkContext(conf)
}
class StackOverflowSuite {
import StackOverflowSuite._
lazy val testObject = new StackOverflow {
override val langs =
List(
"JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS",
"Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy")
override def langSpread = 50000
override def kmeansKernels = 45
override def kmeansEta: Double = 20.0D
override def kmeansMaxIterations = 120
}
@Test def `testObject can be instantiated`: Unit = {
val instantiatable = try {
testObject
true
} catch {
case _: Throwable => false
}
assert(instantiatable, "Can't instantiate a StackOverflow object")
}
@Rule def individualTestTimeout = new org.junit.rules.Timeout(300 * 1000)
}

Binary file not shown.