Add stackoverflow assignment
This commit is contained in:
commit
0a94b65038
20
.gitignore
vendored
Normal file
20
.gitignore
vendored
Normal file
@ -0,0 +1,20 @@
|
||||
# General
*.DS_Store
*.swp
*~

# Dotty
*.class
*.tasty
*.hasTasty

# sbt
target/

# Dotty IDE
/.dotty-ide-artifact
/.dotty-ide.json

# datasets
stackoverflow-grading.csv
wikipedia-grading.dat
|
||||
36
.gitlab-ci.yml
Normal file
36
.gitlab-ci.yml
Normal file
@ -0,0 +1,36 @@
|
||||
# DO NOT EDIT THIS FILE

stages:
  - build
  - grade

compile:
  stage: build
  image: lampepfl/moocs:dotty-2020-02-12
  except:
    - tags
  tags:
    - cs206
  script:
    - sbt packageSubmission
  artifacts:
    expire_in: 1 day
    paths:
      - submission.jar

grade:
  stage: grade
  except:
    - tags
  tags:
    - cs206
  image:
    name: smarter3/moocs:bigdata-stackoverflow-2020-05-11-2
    entrypoint: [""]
  allow_failure: true
  before_script:
    - mkdir -p /shared/submission/
    - cp submission.jar /shared/submission/submission.jar
  script:
    - cd /grader
    - /grader/grade | /grader/feedback-printer
|
||||
8
.vscode/settings.json
vendored
Normal file
8
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
{
  "dotty": {
    "trace": {
      "remoteTracingUrl": "wss://lamppc36.epfl.ch/dotty-remote-tracer/upload/lsp.log",
      "server": { "format": "JSON", "verbosity": "verbose" }
    }
  }
}
|
||||
9
assignment.sbt
Normal file
9
assignment.sbt
Normal file
@ -0,0 +1,9 @@
|
||||
// Student tasks (i.e. submit, packageSubmission)
enablePlugins(StudentTasks)

// Coursera metadata identifying this assignment for submission.
courseraId := ch.epfl.lamp.CourseraId(
  key = "7ByAoS4kEea1yxIfJA1CUw",
  itemId = "QhzMw",
  premiumItemId = Some("FWGnz"),
  partId = "OY5fJ"
)
|
||||
20
build.sbt
Normal file
20
build.sbt
Normal file
@ -0,0 +1,20 @@
|
||||
course := "bigdata"
assignment := "stackoverflow"

scalaVersion := "0.24.0-RC1"
scalacOptions ++= Seq("-language:implicitConversions", "-deprecation")
libraryDependencies ++= Seq(
  "com.novocode" % "junit-interface" % "0.11" % Test,
  ("org.apache.spark" %% "spark-core" % "3.0.0-X1").withDottyCompat(scalaVersion.value)
)

// Contains Spark 3 snapshot built against 2.13: https://github.com/smarter/spark/tree/scala-2.13
resolvers += Resolver.bintrayRepo("smarter", "maven")

// -a: show stack traces, -v: verbose, -s: show durations for each test.
testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-a", "-v", "-s")

testSuite := "stackoverflow.StackOverflowSuite"

// Without forking, ctrl-c doesn't actually fully stop Spark
fork in run := true
fork in Test := true
|
||||
BIN
grading-tests.jar
Normal file
BIN
grading-tests.jar
Normal file
Binary file not shown.
46
project/MOOCSettings.scala
Normal file
46
project/MOOCSettings.scala
Normal file
@ -0,0 +1,46 @@
|
||||
package ch.epfl.lamp

import sbt._
import sbt.Keys._

/**
 * Coursera uses two versions of each assignment. They both have the same assignment key and part id but have
 * different item ids.
 *
 * @param key           Assignment key
 * @param partId        Assignment partId
 * @param itemId        Item id of the non premium version
 * @param premiumItemId Item id of the premium version (`None` if the assignment is optional)
 */
case class CourseraId(key: String, partId: String, itemId: String, premiumItemId: Option[String])
|
||||
|
||||
/**
 * Settings shared by all assignments, reused in various tasks.
 */
object MOOCSettings extends AutoPlugin {

  object autoImport {
    // Course/assignment identity; combined below to form the project name.
    val course = SettingKey[String]("course")
    val assignment = SettingKey[String]("assignment")
    val options = SettingKey[Map[String, Map[String, String]]]("options")
    val courseraId = settingKey[CourseraId]("Coursera-specific information identifying the assignment")
    val testSuite = settingKey[String]("Fully qualified name of the test suite of this assignment")
    // Convenient alias so build definitions can write `CourseraId(...)` without an extra import.
    type CourseraId = ch.epfl.lamp.CourseraId
    val CourseraId = ch.epfl.lamp.CourseraId
  }

  import autoImport._

  override val globalSettings: Seq[Def.Setting[_]] = Seq(
    // supershell is verbose, buggy and useless.
    useSuperShell := false
  )

  override val projectSettings: Seq[Def.Setting[_]] = Seq(
    // Spark-based tests must not run concurrently (shared SparkContext).
    parallelExecution in Test := false,
    // Report test result after each test instead of waiting for every test to finish
    logBuffered in Test := false,
    name := s"${course.value}-${assignment.value}"
  )
}
|
||||
318
project/StudentTasks.scala
Normal file
318
project/StudentTasks.scala
Normal file
@ -0,0 +1,318 @@
|
||||
package ch.epfl.lamp

import sbt._
import Keys._

// import scalaj.http._
import java.io.{File, FileInputStream, IOException}
import org.apache.commons.codec.binary.Base64
// import play.api.libs.json.{Json, JsObject, JsPath}
import scala.util.{Failure, Success, Try}

/**
 * Provides tasks for submitting the assignment
 */
object StudentTasks extends AutoPlugin {

  // This plugin only makes sense together with the shared MOOC settings.
  override def requires = super.requires && MOOCSettings

  object autoImport {
    val packageSourcesOnly = TaskKey[File]("packageSourcesOnly", "Package the sources of the project")
    val packageBinWithoutResources = TaskKey[File]("packageBinWithoutResources", "Like packageBin, but without the resources")
    val packageSubmissionZip = TaskKey[File]("packageSubmissionZip")
    val packageSubmission = inputKey[Unit]("package solution as an archive file")
    val runGradingTests = taskKey[Unit]("run black-box tests used for final grading")
  }

  import autoImport._
  import MOOCSettings.autoImport._

  override lazy val projectSettings = Seq(
    packageSubmissionSetting,
    // submitSetting,
    runGradingTestsSettings,

    // Forked JVM with stdin/stdout wired through so interactive runs work.
    fork := true,
    connectInput in run := true,
    outputStrategy := Some(StdoutOutput),
  ) ++ packageSubmissionZipSettings
|
||||
|
||||
lazy val runGradingTestsSettings = runGradingTests := {
|
||||
val testSuiteJar = "grading-tests.jar"
|
||||
if (!new File(testSuiteJar).exists) {
|
||||
throw new MessageOnlyException(s"Could not find tests JarFile: $testSuiteJar")
|
||||
}
|
||||
|
||||
val classPath = s"${(Test / dependencyClasspath).value.map(_.data).mkString(File.pathSeparator)}${File.pathSeparator}$testSuiteJar"
|
||||
val junitProcess =
|
||||
Fork.java.fork(
|
||||
ForkOptions(),
|
||||
"-cp" :: classPath ::
|
||||
"org.junit.runner.JUnitCore" ::
|
||||
(Test / testSuite).value ::
|
||||
Nil
|
||||
)
|
||||
|
||||
// Wait for tests to complete.
|
||||
junitProcess.exitValue()
|
||||
}
|
||||
|
||||
|
||||
/** **********************************************************
|
||||
* SUBMITTING A SOLUTION TO COURSERA
|
||||
*/
|
||||
|
||||
val packageSubmissionZipSettings = Seq(
|
||||
packageSubmissionZip := {
|
||||
val submission = crossTarget.value / "submission.zip"
|
||||
val sources = (packageSourcesOnly in Compile).value
|
||||
val binaries = (packageBinWithoutResources in Compile).value
|
||||
IO.zip(Seq(sources -> "sources.zip", binaries -> "binaries.jar"), submission)
|
||||
submission
|
||||
},
|
||||
artifactClassifier in packageSourcesOnly := Some("sources"),
|
||||
artifact in (Compile, packageBinWithoutResources) ~= (art => art.withName(art.name + "-without-resources"))
|
||||
) ++
|
||||
inConfig(Compile)(
|
||||
Defaults.packageTaskSettings(packageSourcesOnly, Defaults.sourceMappings) ++
|
||||
Defaults.packageTaskSettings(packageBinWithoutResources, Def.task {
|
||||
val relativePaths =
|
||||
(unmanagedResources in Compile).value.flatMap(Path.relativeTo((unmanagedResourceDirectories in Compile).value)(_))
|
||||
(mappings in (Compile, packageBin)).value.filterNot { case (_, path) => relativePaths.contains(path) }
|
||||
})
|
||||
)
|
||||
|
||||
val maxSubmitFileSize = {
|
||||
val mb = 1024 * 1024
|
||||
10 * mb
|
||||
}
|
||||
|
||||
/** Check that the jar exists, isn't empty, isn't crazy big, and can be read
|
||||
* If so, encode jar as base64 so we can send it to Coursera
|
||||
*/
|
||||
def prepareJar(jar: File, s: TaskStreams): String = {
|
||||
val errPrefix = "Error submitting assignment jar: "
|
||||
val fileLength = jar.length()
|
||||
if (!jar.exists()) {
|
||||
s.log.error(errPrefix + "jar archive does not exist\n" + jar.getAbsolutePath)
|
||||
failSubmit()
|
||||
} else if (fileLength == 0L) {
|
||||
s.log.error(errPrefix + "jar archive is empty\n" + jar.getAbsolutePath)
|
||||
failSubmit()
|
||||
} else if (fileLength > maxSubmitFileSize) {
|
||||
s.log.error(errPrefix + "jar archive is too big. Allowed size: " +
|
||||
maxSubmitFileSize + " bytes, found " + fileLength + " bytes.\n" +
|
||||
jar.getAbsolutePath)
|
||||
failSubmit()
|
||||
} else {
|
||||
val bytes = new Array[Byte](fileLength.toInt)
|
||||
val sizeRead = try {
|
||||
val is = new FileInputStream(jar)
|
||||
val read = is.read(bytes)
|
||||
is.close()
|
||||
read
|
||||
} catch {
|
||||
case ex: IOException =>
|
||||
s.log.error(errPrefix + "failed to read sources jar archive\n" + ex.toString)
|
||||
failSubmit()
|
||||
}
|
||||
if (sizeRead != bytes.length) {
|
||||
s.log.error(errPrefix + "failed to read the sources jar archive, size read: " + sizeRead)
|
||||
failSubmit()
|
||||
} else encodeBase64(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
/** Task to package solution to a given file path */
|
||||
lazy val packageSubmissionSetting = packageSubmission := {
|
||||
val args: Seq[String] = Def.spaceDelimited("[path]").parsed
|
||||
val s: TaskStreams = streams.value // for logging
|
||||
val jar = (packageSubmissionZip in Compile).value
|
||||
|
||||
val base64Jar = prepareJar(jar, s)
|
||||
|
||||
val path = args.headOption.getOrElse((baseDirectory.value / "submission.jar").absolutePath)
|
||||
scala.tools.nsc.io.File(path).writeAll(base64Jar)
|
||||
}
|
||||
|
||||
/*
|
||||
/** Task to submit a solution to coursera */
|
||||
val submit = inputKey[Unit]("submit solution to Coursera")
|
||||
lazy val submitSetting = submit := {
|
||||
// Fail if scalafix linting does not pass.
|
||||
scalafixLinting.value
|
||||
|
||||
val args: Seq[String] = Def.spaceDelimited("<arg>").parsed
|
||||
val s: TaskStreams = streams.value // for logging
|
||||
val jar = (packageSubmissionZip in Compile).value
|
||||
|
||||
val assignmentDetails =
|
||||
courseraId.?.value.getOrElse(throw new MessageOnlyException("This assignment can not be submitted to Coursera because the `courseraId` setting is undefined"))
|
||||
val assignmentKey = assignmentDetails.key
|
||||
val courseName =
|
||||
course.value match {
|
||||
case "capstone" => "scala-capstone"
|
||||
case "bigdata" => "scala-spark-big-data"
|
||||
case other => other
|
||||
}
|
||||
|
||||
val partId = assignmentDetails.partId
|
||||
val itemId = assignmentDetails.itemId
|
||||
val premiumItemId = assignmentDetails.premiumItemId
|
||||
|
||||
val (email, secret) = args match {
|
||||
case email :: secret :: Nil =>
|
||||
(email, secret)
|
||||
case _ =>
|
||||
val inputErr =
|
||||
s"""|Invalid input to `submit`. The required syntax for `submit` is:
|
||||
|submit <email-address> <submit-token>
|
||||
|
|
||||
|The submit token is NOT YOUR LOGIN PASSWORD.
|
||||
|It can be obtained from the assignment page:
|
||||
|https://www.coursera.org/learn/$courseName/programming/$itemId
|
||||
|${
|
||||
premiumItemId.fold("") { id =>
|
||||
s"""or (for premium learners):
|
||||
|https://www.coursera.org/learn/$courseName/programming/$id
|
||||
""".stripMargin
|
||||
}
|
||||
}
|
||||
""".stripMargin
|
||||
s.log.error(inputErr)
|
||||
failSubmit()
|
||||
}
|
||||
|
||||
val base64Jar = prepareJar(jar, s)
|
||||
val json =
|
||||
s"""|{
|
||||
| "assignmentKey":"$assignmentKey",
|
||||
| "submitterEmail":"$email",
|
||||
| "secret":"$secret",
|
||||
| "parts":{
|
||||
| "$partId":{
|
||||
| "output":"$base64Jar"
|
||||
| }
|
||||
| }
|
||||
|}""".stripMargin
|
||||
|
||||
def postSubmission[T](data: String): Try[HttpResponse[String]] = {
|
||||
val http = Http("https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1")
|
||||
val hs = List(
|
||||
("Cache-Control", "no-cache"),
|
||||
("Content-Type", "application/json")
|
||||
)
|
||||
s.log.info("Connecting to Coursera...")
|
||||
val response = Try(http.postData(data)
|
||||
.headers(hs)
|
||||
.option(HttpOptions.connTimeout(10000)) // scalaj default timeout is only 100ms, changing that to 10s
|
||||
.asString) // kick off HTTP POST
|
||||
response
|
||||
}
|
||||
|
||||
val connectMsg =
|
||||
s"""|Attempting to submit "${assignment.value}" assignment in "$courseName" course
|
||||
|Using:
|
||||
|- email: $email
|
||||
|- submit token: $secret""".stripMargin
|
||||
s.log.info(connectMsg)
|
||||
|
||||
def reportCourseraResponse(response: HttpResponse[String]): Unit = {
|
||||
val code = response.code
|
||||
val respBody = response.body
|
||||
|
||||
/* Sample JSON response from Coursera
|
||||
{
|
||||
"message": "Invalid email or token.",
|
||||
"details": {
|
||||
"learnerMessage": "Invalid email or token."
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// Success, Coursera responds with 2xx HTTP status code
|
||||
if (response.is2xx) {
|
||||
val successfulSubmitMsg =
|
||||
s"""|Successfully connected to Coursera. (Status $code)
|
||||
|
|
||||
|Assignment submitted successfully!
|
||||
|
|
||||
|You can see how you scored by going to:
|
||||
|https://www.coursera.org/learn/$courseName/programming/$itemId/
|
||||
|${
|
||||
premiumItemId.fold("") { id =>
|
||||
s"""or (for premium learners):
|
||||
|https://www.coursera.org/learn/$courseName/programming/$id
|
||||
""".stripMargin
|
||||
}
|
||||
}
|
||||
|and clicking on "My Submission".""".stripMargin
|
||||
s.log.info(successfulSubmitMsg)
|
||||
}
|
||||
|
||||
// Failure, Coursera responds with 4xx HTTP status code (client-side failure)
|
||||
else if (response.is4xx) {
|
||||
val result = Try(Json.parse(respBody)).toOption
|
||||
val learnerMsg = result match {
|
||||
case Some(resp: JsObject) =>
|
||||
(JsPath \ "details" \ "learnerMessage").read[String].reads(resp).get
|
||||
case Some(x) => // shouldn't happen
|
||||
"Could not parse Coursera's response:\n" + x
|
||||
case None =>
|
||||
"Could not parse Coursera's response:\n" + respBody
|
||||
}
|
||||
val failedSubmitMsg =
|
||||
s"""|Submission failed.
|
||||
|There was something wrong while attempting to submit.
|
||||
|Coursera says:
|
||||
|$learnerMsg (Status $code)""".stripMargin
|
||||
s.log.error(failedSubmitMsg)
|
||||
}
|
||||
|
||||
// Failure, Coursera responds with 5xx HTTP status code (server-side failure)
|
||||
else if (response.is5xx) {
|
||||
val failedSubmitMsg =
|
||||
s"""|Submission failed.
|
||||
|Coursera seems to be unavailable at the moment (Status $code)
|
||||
|Check https://status.coursera.org/ and try again in a few minutes.
|
||||
""".stripMargin
|
||||
s.log.error(failedSubmitMsg)
|
||||
}
|
||||
|
||||
// Failure, Coursera repsonds with an unexpected status code
|
||||
else {
|
||||
val failedSubmitMsg =
|
||||
s"""|Submission failed.
|
||||
|Coursera replied with an unexpected code (Status $code)
|
||||
""".stripMargin
|
||||
s.log.error(failedSubmitMsg)
|
||||
}
|
||||
}
|
||||
|
||||
// kick it all off, actually make request
|
||||
postSubmission(json) match {
|
||||
case Success(resp) => reportCourseraResponse(resp)
|
||||
case Failure(e) =>
|
||||
val failedConnectMsg =
|
||||
s"""|Connection to Coursera failed.
|
||||
|There was something wrong while attempting to connect to Coursera.
|
||||
|Check your internet connection.
|
||||
|${e.toString}""".stripMargin
|
||||
s.log.error(failedConnectMsg)
|
||||
}
|
||||
|
||||
}
|
||||
*/
|
||||
|
||||
def failSubmit(): Nothing = {
|
||||
sys.error("Submission failed")
|
||||
}
|
||||
|
||||
/**
|
||||
* *****************
|
||||
* DEALING WITH JARS
|
||||
*/
|
||||
def encodeBase64(bytes: Array[Byte]): String =
|
||||
new String(Base64.encodeBase64(bytes))
|
||||
}
|
||||
1
project/build.properties
Normal file
1
project/build.properties
Normal file
@ -0,0 +1 @@
|
||||
sbt.version=1.3.8
|
||||
5
project/buildSettings.sbt
Normal file
5
project/buildSettings.sbt
Normal file
@ -0,0 +1,5 @@
|
||||
// Used for Coursera submission (StudentPlugin)
// libraryDependencies += "org.scalaj" %% "scalaj-http" % "2.4.2"
// libraryDependencies += "com.typesafe.play" %% "play-json" % "2.7.4"
// Used for Base64 (StudentPlugin)
libraryDependencies += "commons-codec" % "commons-codec" % "1.10"
|
||||
2
project/plugins.sbt
Normal file
2
project/plugins.sbt
Normal file
@ -0,0 +1,2 @@
|
||||
addSbtPlugin("org.scala-js" % "sbt-scalajs" % "0.6.28")
addSbtPlugin("ch.epfl.lamp" % "sbt-dotty" % "0.4.0")
|
||||
0
src/main/resources/stackoverflow/.keep
Normal file
0
src/main/resources/stackoverflow/.keep
Normal file
308
src/main/scala/stackoverflow/StackOverflow.scala
Normal file
308
src/main/scala/stackoverflow/StackOverflow.scala
Normal file
@ -0,0 +1,308 @@
|
||||
package stackoverflow

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.log4j.{Logger, Level}

import annotation.tailrec
import scala.reflect.ClassTag
import scala.util.Properties.isWin

// Domain aliases used throughout the assignment (Scala 3 top-level type aliases).
type Question = Posting
type Answer = Posting
type QID = Int
type HighScore = Int
type LangIndex = Int

/** A raw stackoverflow posting, either a question or an answer.
 *
 *  @param postingType    1 for questions, 2 for answers (per the CSV format parsed in `rawPostings`)
 *  @param id             unique posting id
 *  @param acceptedAnswer id of the accepted answer, if any (questions only)
 *  @param parentId       id of the question this answer belongs to (answers only)
 *  @param score          the posting's score
 *  @param tags           language tag of the posting, if present
 */
case class Posting(postingType: Int, id: Int, acceptedAnswer: Option[Int], parentId: Option[QID], score: Int, tags: Option[String]) extends Serializable
|
||||
|
||||
/** The main object: wires up Spark and runs the full pipeline. */
object StackOverflow extends StackOverflow {

  // Reduce Spark logging verbosity
  Logger.getLogger("org").setLevel(Level.ERROR)

  // On Windows, point Hadoop at the bundled winutils binaries.
  if (isWin) System.setProperty("hadoop.home.dir", System.getProperty("user.dir") + "\\winutils\\hadoop-2.7.4")

  @transient lazy val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("StackOverflow")
  @transient lazy val sc: SparkContext = new SparkContext(conf)

  /** Main function: parse -> group -> score -> vectorize -> cluster -> print. */
  def main(args: Array[String]): Unit = {

    val lines   = sc.textFile("src/main/resources/stackoverflow/stackoverflow-grading.csv")
    val raw     = rawPostings(lines)
    val grouped = groupedPostings(raw)
    val scored  = scoredPostings(grouped)
    val vectors = vectorPostings(scored)
    // assert(vectors.count() == 2121822, "Incorrect number of vectors: " + vectors.count())

    val means   = kmeans(sampleVectors(vectors), vectors, debug = true)
    val results = clusterResults(means, vectors)
    printResults(results)
  }
}
|
||||
|
||||
/** The parsing and kmeans methods. Assignment stubs (`???`) are intentionally left for the student. */
class StackOverflow extends Serializable {

  /** Languages */
  val langs =
    List(
      "JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS",
      "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy")

  /** K-means parameter: How "far apart" languages should be for the kmeans algorithm? */
  def langSpread = 50000
  assert(langSpread > 0, "If langSpread is zero we can't recover the language from the input data!")

  /** K-means parameter: Number of clusters */
  def kmeansKernels = 45

  /** K-means parameter: Convergence criteria */
  def kmeansEta: Double = 20.0D

  /** K-means parameter: Maximum iterations */
  def kmeansMaxIterations = 120

  //
  //
  // Parsing utilities:
  //
  //

  /** Load postings from the given file.
   *  Expected CSV columns: postingType, id, acceptedAnswer, parentId, score[, tag].
   */
  def rawPostings(lines: RDD[String]): RDD[Posting] =
    lines.map(line => {
      val arr = line.split(",")
      Posting(postingType =    arr(0).toInt,
              id =             arr(1).toInt,
              acceptedAnswer = if (arr(2) == "") None else Some(arr(2).toInt),
              parentId =       if (arr(3) == "") None else Some(arr(3).toInt),
              score =          arr(4).toInt,
              // intern() because the small set of tag strings is repeated millions of times
              tags =           if (arr.length >= 6) Some(arr(5).intern()) else None)
    })

  /** Group the questions and answers together */
  def groupedPostings(postings: RDD[Posting]): RDD[(QID, Iterable[(Question, Answer)])] = {
    ???
  }

  /** Compute the maximum score for each posting */
  def scoredPostings(grouped: RDD[(QID, Iterable[(Question, Answer)])]): RDD[(Question, HighScore)] = {

    // Imperative max over the answers' scores; 0 if the array is empty.
    def answerHighScore(as: Array[Answer]): HighScore = {
      var highScore = 0
      var i = 0
      while (i < as.length) {
        val score = as(i).score
        if (score > highScore)
          highScore = score
        i += 1
      }
      highScore
    }

    ???
  }

  /** Compute the vectors for the kmeans */
  def vectorPostings(scored: RDD[(Question, HighScore)]): RDD[(LangIndex, HighScore)] = {
    /** Return optional index of first language that occurs in `tags`. */
    def firstLangInTag(tag: Option[String], ls: List[String]): Option[Int] = {
      if (tag.isEmpty) None
      else if (ls.isEmpty) None
      else if (tag.get == ls.head) Some(0) // index: 0
      else {
        val tmp = firstLangInTag(tag, ls.tail)
        tmp match {
          case None => None
          case Some(i) => Some(i + 1) // index i in ls.tail => index i+1
        }
      }
    }

    ???
  }

  /** Sample `kmeansKernels` vectors to serve as the initial cluster means. */
  def sampleVectors(vectors: RDD[(LangIndex, HighScore)]): Array[(Int, Int)] = {

    assert(kmeansKernels % langs.length == 0, "kmeansKernels should be a multiple of the number of languages studied.")
    val perLang = kmeansKernels / langs.length

    // http://en.wikipedia.org/wiki/Reservoir_sampling
    def reservoirSampling(lang: Int, iter: Iterator[Int], size: Int): Array[Int] = {
      val res = new Array[Int](size)
      // Seeded with the language index so sampling is deterministic per language.
      val rnd = new util.Random(lang)

      for (i <- 0 until size) {
        assert(iter.hasNext, s"iterator must have at least $size elements")
        res(i) = iter.next
      }

      var i = size.toLong
      while (iter.hasNext) {
        val elt = iter.next
        val j = math.abs(rnd.nextLong) % i
        if (j < size)
          res(j.toInt) = elt
        i += 1
      }

      res
    }

    val res =
      if (langSpread < 500)
        // sample the space regardless of the language
        vectors.takeSample(false, kmeansKernels, 42)
      else
        // sample the space uniformly from each language partition
        vectors.groupByKey.flatMap({
          case (lang, vectors) => reservoirSampling(lang, vectors.iterator, perLang).map((lang, _))
        }).collect()

    assert(res.length == kmeansKernels, res.length)
    res
  }

  //
  //
  // Kmeans method:
  //
  //

  /** Main kmeans computation: iterate until the total movement of the means
   *  drops below `kmeansEta` or `kmeansMaxIterations` is reached.
   */
  @tailrec final def kmeans(means: Array[(Int, Int)], vectors: RDD[(Int, Int)], iter: Int = 1, debug: Boolean = false): Array[(Int, Int)] = {
    val newMeans = means.clone() // you need to compute newMeans

    // TODO: Fill in the newMeans array
    val distance = euclideanDistance(means, newMeans)

    if (debug) {
      println(s"""Iteration: $iter
                 |  * current distance: $distance
                 |  * desired distance: $kmeansEta
                 |  * means:""".stripMargin)
      for (idx <- 0 until kmeansKernels)
        println(f"   ${means(idx).toString}%20s ==> ${newMeans(idx).toString}%20s " +
                f"  distance: ${euclideanDistance(means(idx), newMeans(idx))}%8.0f")
    }

    if (converged(distance))
      newMeans
    else if (iter < kmeansMaxIterations)
      kmeans(newMeans, vectors, iter + 1, debug)
    else {
      if (debug) {
        println("Reached max iterations!")
      }
      newMeans
    }
  }

  //
  //
  // Kmeans utilities:
  //
  //

  /** Decide whether the kmeans clustering converged */
  def converged(distance: Double) =
    distance < kmeansEta

  /** Return the squared euclidean distance between two points.
   *  (Squared distance is sufficient for comparisons and convergence checks.)
   */
  def euclideanDistance(v1: (Int, Int), v2: (Int, Int)): Double = {
    val part1 = (v1._1 - v2._1).toDouble * (v1._1 - v2._1)
    val part2 = (v1._2 - v2._2).toDouble * (v1._2 - v2._2)
    part1 + part2
  }

  /** Return the summed pairwise euclidean distance between two arrays of points. */
  def euclideanDistance(a1: Array[(Int, Int)], a2: Array[(Int, Int)]): Double = {
    assert(a1.length == a2.length)
    var sum = 0d
    var idx = 0
    while(idx < a1.length) {
      sum += euclideanDistance(a1(idx), a2(idx))
      idx += 1
    }
    sum
  }

  /** Return the index of the center closest to `p`. */
  def findClosest(p: (Int, Int), centers: Array[(Int, Int)]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity
    for (i <- 0 until centers.length) {
      val tempDist = euclideanDistance(p, centers(i))
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }
    bestIndex
  }

  /** Average the vectors (component-wise integer mean; assumes `ps` is non-empty). */
  def averageVectors(ps: Iterable[(Int, Int)]): (Int, Int) = {
    val iter = ps.iterator
    var count = 0
    var comp1: Long = 0
    var comp2: Long = 0
    while (iter.hasNext) {
      val item = iter.next
      comp1 += item._1
      comp2 += item._2
      count += 1
    }
    ((comp1 / count).toInt, (comp2 / count).toInt)
  }

  //
  //
  // Displaying results:
  //
  //

  /** Summarize each cluster: dominant language, its share, cluster size, median score.
   *  Results are sorted by median score.
   */
  def clusterResults(means: Array[(Int, Int)], vectors: RDD[(LangIndex, HighScore)]): Array[(String, Double, Int, Int)] = {
    val closest = vectors.map(p => (findClosest(p, means), p))
    val closestGrouped = closest.groupByKey()

    val median = closestGrouped.mapValues { vs =>
      val langLabel: String   = ??? // most common language in the cluster
      val langPercent: Double = ??? // percent of the questions in the most common language
      val clusterSize: Int    = ???
      val medianScore: Int    = ???

      (langLabel, langPercent, clusterSize, medianScore)
    }

    median.collect().map(_._2).sortBy(_._4)
  }

  /** Pretty-print the cluster summary table. */
  def printResults(results: Array[(String, Double, Int, Int)]): Unit = {
    println("Resulting clusters:")
    println("  Score  Dominant language (%percent)  Questions")
    println("================================================")
    for ((lang, percent, size, score) <- results)
      println(f"${score}%7d  ${lang}%-17s (${percent}%-5.1f%%)      ${size}%7d")
  }
}
|
||||
47
src/test/scala/stackoverflow/StackOverflowSuite.scala
Normal file
47
src/test/scala/stackoverflow/StackOverflowSuite.scala
Normal file
@ -0,0 +1,47 @@
|
||||
package stackoverflow

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.junit._
import org.junit.Assert.assertEquals
import java.io.File
import scala.io.{ Codec, Source }
import scala.util.Properties.isWin

/** Shared Spark fixture for the suite: one local SparkContext for all tests. */
object StackOverflowSuite {
  // On Windows, point Hadoop at the bundled winutils binaries.
  if (isWin) System.setProperty("hadoop.home.dir", System.getProperty("user.dir") + "\\winutils\\hadoop-2.7.4")

  val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("StackOverflow")
  val sc: SparkContext = new SparkContext(conf)
}
|
||||
|
||||
class StackOverflowSuite {
  import StackOverflowSuite._

  // A StackOverflow instance with the default assignment parameters pinned explicitly,
  // so tests keep working even if the defaults in StackOverflow change.
  lazy val testObject = new StackOverflow {
    override val langs =
      List(
        "JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS",
        "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy")
    override def langSpread = 50000
    override def kmeansKernels = 45
    override def kmeansEta: Double = 20.0D
    override def kmeansMaxIterations = 120
  }

  @Test def `testObject can be instantiated`: Unit = {
    val instantiatable = try {
      testObject
      true
    } catch {
      case _: Throwable => false
    }
    assert(instantiatable, "Can't instantiate a StackOverflow object")
  }

  // 300 s per-test timeout so a hung Spark job fails the build instead of blocking it.
  @Rule def individualTestTimeout = new org.junit.rules.Timeout(300 * 1000)
}
|
||||
BIN
winutils/hadoop-2.7.4/bin/winutils.exe
Normal file
BIN
winutils/hadoop-2.7.4/bin/winutils.exe
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user