commit 0a94b65038a0a6bc9a919afeae35d0bc651995b3 Author: Guillaume Martres Date: Tue Feb 19 20:44:23 2019 +0100 Add stackoverflow assignment diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..996b5d0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +# General +*.DS_Store +*.swp +*~ + +# Dotty +*.class +*.tasty +*.hasTasty + +# sbt +target/ + +# Dotty IDE +/.dotty-ide-artifact +/.dotty-ide.json + +# datasets +stackoverflow-grading.csv +wikipedia-grading.dat diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..d074697 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,36 @@ +# DO NOT EDIT THIS FILE + +stages: + - build + - grade + +compile: + stage: build + image: lampepfl/moocs:dotty-2020-02-12 + except: + - tags + tags: + - cs206 + script: + - sbt packageSubmission + artifacts: + expire_in: 1 day + paths: + - submission.jar + +grade: + stage: grade + except: + - tags + tags: + - cs206 + image: + name: smarter3/moocs:bigdata-stackoverflow-2020-05-11-2 + entrypoint: [""] + allow_failure: true + before_script: + - mkdir -p /shared/submission/ + - cp submission.jar /shared/submission/submission.jar + script: + - cd /grader + - /grader/grade | /grader/feedback-printer diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..a35362b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,8 @@ +{ + "dotty": { + "trace": { + "remoteTracingUrl": "wss://lamppc36.epfl.ch/dotty-remote-tracer/upload/lsp.log", + "server": { "format": "JSON", "verbosity": "verbose" } + } + } +} diff --git a/assignment.sbt b/assignment.sbt new file mode 100644 index 0000000..54a90fe --- /dev/null +++ b/assignment.sbt @@ -0,0 +1,9 @@ +// Student tasks (i.e. submit, packageSubmission) +enablePlugins(StudentTasks) + +courseraId := ch.epfl.lamp.CourseraId( + key = "7ByAoS4kEea1yxIfJA1CUw", + itemId = "QhzMw", + premiumItemId = Some("FWGnz"), + partId = "OY5fJ" +) diff --git a/build.sbt b/build.sbt new file mode 100644 index 0000000..32b23dd --- /dev/null +++ b/build.sbt @@ -0,0 +1,20 @@ +course := "bigdata" +assignment := "stackoverflow" + +scalaVersion := "0.24.0-RC1" +scalacOptions ++= Seq("-language:implicitConversions", "-deprecation") +libraryDependencies ++= Seq( + "com.novocode" % "junit-interface" % "0.11" % Test, + ("org.apache.spark" %% "spark-core" % "3.0.0-X1").withDottyCompat(scalaVersion.value), +) + +// Contains Spark 3 snapshot built against 2.13: https://github.com/smarter/spark/tree/scala-2.13 +resolvers += Resolver.bintrayRepo("smarter", "maven") + +testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-a", "-v", "-s") + +testSuite := "stackoverflow.StackOverflowSuite" + +// Without forking, ctrl-c doesn't actually fully stop Spark +fork in run := true +fork in Test := true diff --git a/grading-tests.jar b/grading-tests.jar new file mode 100644 index 0000000..5564ba8 Binary files /dev/null and b/grading-tests.jar differ diff --git a/project/MOOCSettings.scala b/project/MOOCSettings.scala new file mode 100644 index 0000000..171244f --- /dev/null +++ b/project/MOOCSettings.scala @@ -0,0 +1,46 @@ +package ch.epfl.lamp + +import sbt._ +import sbt.Keys._ + +/** + * Coursera uses two versions of each assignment. They both have the same assignment key and part id but have + * different item ids. + * + * @param key Assignment key + * @param partId Assignment partId + * @param itemId Item id of the non premium version + * @param premiumItemId Item id of the premium version (`None` if the assignment is optional) + */ +case class CourseraId(key: String, partId: String, itemId: String, premiumItemId: Option[String]) + +/** + * Settings shared by all assignments, reused in various tasks. + */ +object MOOCSettings extends AutoPlugin { + + object autoImport { + val course = SettingKey[String]("course") + val assignment = SettingKey[String]("assignment") + val options = SettingKey[Map[String, Map[String, String]]]("options") + val courseraId = settingKey[CourseraId]("Coursera-specific information identifying the assignment") + val testSuite = settingKey[String]("Fully qualified name of the test suite of this assignment") + // Convenient alias + type CourseraId = ch.epfl.lamp.CourseraId + val CourseraId = ch.epfl.lamp.CourseraId + } + + import autoImport._ + + override val globalSettings: Seq[Def.Setting[_]] = Seq( + // supershell is verbose, buggy and useless. + useSuperShell := false + ) + + override val projectSettings: Seq[Def.Setting[_]] = Seq( + parallelExecution in Test := false, + // Report test result after each test instead of waiting for every test to finish + logBuffered in Test := false, + name := s"${course.value}-${assignment.value}" + ) +} diff --git a/project/StudentTasks.scala b/project/StudentTasks.scala new file mode 100644 index 0000000..7604830 --- /dev/null +++ b/project/StudentTasks.scala @@ -0,0 +1,318 @@ +package ch.epfl.lamp + +import sbt._ +import Keys._ + +// import scalaj.http._ +import java.io.{File, FileInputStream, IOException} +import org.apache.commons.codec.binary.Base64 +// import play.api.libs.json.{Json, JsObject, JsPath} +import scala.util.{Failure, Success, Try} + +/** + * Provides tasks for submitting the assignment + */ +object StudentTasks extends AutoPlugin { + + override def requires = super.requires && MOOCSettings + + object autoImport { + val packageSourcesOnly = TaskKey[File]("packageSourcesOnly", "Package the sources of the project") + val packageBinWithoutResources = TaskKey[File]("packageBinWithoutResources", "Like packageBin, but without the resources") + val packageSubmissionZip = TaskKey[File]("packageSubmissionZip") + val packageSubmission = inputKey[Unit]("package solution as an archive file") + val runGradingTests = taskKey[Unit]("run black-box tests used for final grading") + } + + + import autoImport._ + import MOOCSettings.autoImport._ + + override lazy val projectSettings = Seq( + packageSubmissionSetting, + // submitSetting, + runGradingTestsSettings, + + fork := true, + connectInput in run := true, + outputStrategy := Some(StdoutOutput), + ) ++ packageSubmissionZipSettings + + lazy val runGradingTestsSettings = runGradingTests := { + val testSuiteJar = "grading-tests.jar" + if (!new File(testSuiteJar).exists) { + throw new MessageOnlyException(s"Could not find tests JarFile: $testSuiteJar") + } + + val classPath = s"${(Test / dependencyClasspath).value.map(_.data).mkString(File.pathSeparator)}${File.pathSeparator}$testSuiteJar" + val junitProcess = + Fork.java.fork( + ForkOptions(), + "-cp" :: classPath :: + "org.junit.runner.JUnitCore" :: + (Test / testSuite).value :: + Nil + ) + + // Wait for tests to complete. + junitProcess.exitValue() + } + + + /** ********************************************************** + * SUBMITTING A SOLUTION TO COURSERA + */ + + val packageSubmissionZipSettings = Seq( + packageSubmissionZip := { + val submission = crossTarget.value / "submission.zip" + val sources = (packageSourcesOnly in Compile).value + val binaries = (packageBinWithoutResources in Compile).value + IO.zip(Seq(sources -> "sources.zip", binaries -> "binaries.jar"), submission) + submission + }, + artifactClassifier in packageSourcesOnly := Some("sources"), + artifact in (Compile, packageBinWithoutResources) ~= (art => art.withName(art.name + "-without-resources")) + ) ++ + inConfig(Compile)( + Defaults.packageTaskSettings(packageSourcesOnly, Defaults.sourceMappings) ++ + Defaults.packageTaskSettings(packageBinWithoutResources, Def.task { + val relativePaths = + (unmanagedResources in Compile).value.flatMap(Path.relativeTo((unmanagedResourceDirectories in Compile).value)(_)) + (mappings in (Compile, packageBin)).value.filterNot { case (_, path) => relativePaths.contains(path) } + }) + ) + + val maxSubmitFileSize = { + val mb = 1024 * 1024 + 10 * mb + } + + /** Check that the jar exists, isn't empty, isn't crazy big, and can be read + * If so, encode jar as base64 so we can send it to Coursera + */ + def prepareJar(jar: File, s: TaskStreams): String = { + val errPrefix = "Error submitting assignment jar: " + val fileLength = jar.length() + if (!jar.exists()) { + s.log.error(errPrefix + "jar archive does not exist\n" + jar.getAbsolutePath) + failSubmit() + } else if (fileLength == 0L) { + s.log.error(errPrefix + "jar archive is empty\n" + jar.getAbsolutePath) + failSubmit() + } else if (fileLength > maxSubmitFileSize) { + s.log.error(errPrefix + "jar archive is too big. Allowed size: " + + maxSubmitFileSize + " bytes, found " + fileLength + " bytes.\n" + + jar.getAbsolutePath) + failSubmit() + } else { + val bytes = new Array[Byte](fileLength.toInt) + val sizeRead = try { + val is = new FileInputStream(jar) + val read = is.read(bytes) + is.close() + read + } catch { + case ex: IOException => + s.log.error(errPrefix + "failed to read sources jar archive\n" + ex.toString) + failSubmit() + } + if (sizeRead != bytes.length) { + s.log.error(errPrefix + "failed to read the sources jar archive, size read: " + sizeRead) + failSubmit() + } else encodeBase64(bytes) + } + } + + /** Task to package solution to a given file path */ + lazy val packageSubmissionSetting = packageSubmission := { + val args: Seq[String] = Def.spaceDelimited("[path]").parsed + val s: TaskStreams = streams.value // for logging + val jar = (packageSubmissionZip in Compile).value + + val base64Jar = prepareJar(jar, s) + + val path = args.headOption.getOrElse((baseDirectory.value / "submission.jar").absolutePath) + scala.tools.nsc.io.File(path).writeAll(base64Jar) + } + +/* + /** Task to submit a solution to coursera */ + val submit = inputKey[Unit]("submit solution to Coursera") + lazy val submitSetting = submit := { + // Fail if scalafix linting does not pass. + scalafixLinting.value + + val args: Seq[String] = Def.spaceDelimited("").parsed + val s: TaskStreams = streams.value // for logging + val jar = (packageSubmissionZip in Compile).value + + val assignmentDetails = + courseraId.?.value.getOrElse(throw new MessageOnlyException("This assignment can not be submitted to Coursera because the `courseraId` setting is undefined")) + val assignmentKey = assignmentDetails.key + val courseName = + course.value match { + case "capstone" => "scala-capstone" + case "bigdata" => "scala-spark-big-data" + case other => other + } + + val partId = assignmentDetails.partId + val itemId = assignmentDetails.itemId + val premiumItemId = assignmentDetails.premiumItemId + + val (email, secret) = args match { + case email :: secret :: Nil => + (email, secret) + case _ => + val inputErr = + s"""|Invalid input to `submit`. The required syntax for `submit` is: + |submit + | + |The submit token is NOT YOUR LOGIN PASSWORD. + |It can be obtained from the assignment page: + |https://www.coursera.org/learn/$courseName/programming/$itemId + |${ + premiumItemId.fold("") { id => + s"""or (for premium learners): + |https://www.coursera.org/learn/$courseName/programming/$id + """.stripMargin + } + } + """.stripMargin + s.log.error(inputErr) + failSubmit() + } + + val base64Jar = prepareJar(jar, s) + val json = + s"""|{ + | "assignmentKey":"$assignmentKey", + | "submitterEmail":"$email", + | "secret":"$secret", + | "parts":{ + | "$partId":{ + | "output":"$base64Jar" + | } + | } + |}""".stripMargin + + def postSubmission[T](data: String): Try[HttpResponse[String]] = { + val http = Http("https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1") + val hs = List( + ("Cache-Control", "no-cache"), + ("Content-Type", "application/json") + ) + s.log.info("Connecting to Coursera...") + val response = Try(http.postData(data) + .headers(hs) + .option(HttpOptions.connTimeout(10000)) // scalaj default timeout is only 100ms, changing that to 10s + .asString) // kick off HTTP POST + response + } + + val connectMsg = + s"""|Attempting to submit "${assignment.value}" assignment in "$courseName" course + |Using: + |- email: $email + |- submit token: $secret""".stripMargin + s.log.info(connectMsg) + + def reportCourseraResponse(response: HttpResponse[String]): Unit = { + val code = response.code + val respBody = response.body + + /* Sample JSON response from Coursera + { + "message": "Invalid email or token.", + "details": { + "learnerMessage": "Invalid email or token." + } + } + */ + + // Success, Coursera responds with 2xx HTTP status code + if (response.is2xx) { + val successfulSubmitMsg = + s"""|Successfully connected to Coursera. (Status $code) + | + |Assignment submitted successfully! + | + |You can see how you scored by going to: + |https://www.coursera.org/learn/$courseName/programming/$itemId/ + |${ + premiumItemId.fold("") { id => + s"""or (for premium learners): + |https://www.coursera.org/learn/$courseName/programming/$id + """.stripMargin + } + } + |and clicking on "My Submission".""".stripMargin + s.log.info(successfulSubmitMsg) + } + + // Failure, Coursera responds with 4xx HTTP status code (client-side failure) + else if (response.is4xx) { + val result = Try(Json.parse(respBody)).toOption + val learnerMsg = result match { + case Some(resp: JsObject) => + (JsPath \ "details" \ "learnerMessage").read[String].reads(resp).get + case Some(x) => // shouldn't happen + "Could not parse Coursera's response:\n" + x + case None => + "Could not parse Coursera's response:\n" + respBody + } + val failedSubmitMsg = + s"""|Submission failed. + |There was something wrong while attempting to submit. + |Coursera says: + |$learnerMsg (Status $code)""".stripMargin + s.log.error(failedSubmitMsg) + } + + // Failure, Coursera responds with 5xx HTTP status code (server-side failure) + else if (response.is5xx) { + val failedSubmitMsg = + s"""|Submission failed. + |Coursera seems to be unavailable at the moment (Status $code) + |Check https://status.coursera.org/ and try again in a few minutes. + """.stripMargin + s.log.error(failedSubmitMsg) + } + + // Failure, Coursera repsonds with an unexpected status code + else { + val failedSubmitMsg = + s"""|Submission failed. + |Coursera replied with an unexpected code (Status $code) + """.stripMargin + s.log.error(failedSubmitMsg) + } + } + + // kick it all off, actually make request + postSubmission(json) match { + case Success(resp) => reportCourseraResponse(resp) + case Failure(e) => + val failedConnectMsg = + s"""|Connection to Coursera failed. + |There was something wrong while attempting to connect to Coursera. + |Check your internet connection. + |${e.toString}""".stripMargin + s.log.error(failedConnectMsg) + } + + } +*/ + + def failSubmit(): Nothing = { + sys.error("Submission failed") + } + + /** + * ***************** + * DEALING WITH JARS + */ + def encodeBase64(bytes: Array[Byte]): String = + new String(Base64.encodeBase64(bytes)) +} diff --git a/project/build.properties b/project/build.properties new file mode 100644 index 0000000..a919a9b --- /dev/null +++ b/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.3.8 diff --git a/project/buildSettings.sbt b/project/buildSettings.sbt new file mode 100644 index 0000000..8fac702 --- /dev/null +++ b/project/buildSettings.sbt @@ -0,0 +1,5 @@ +// Used for Coursera submission (StudentPlugin) +// libraryDependencies += "org.scalaj" %% "scalaj-http" % "2.4.2" +// libraryDependencies += "com.typesafe.play" %% "play-json" % "2.7.4" +// Used for Base64 (StudentPlugin) +libraryDependencies += "commons-codec" % "commons-codec" % "1.10" diff --git a/project/plugins.sbt b/project/plugins.sbt new file mode 100644 index 0000000..017735d --- /dev/null +++ b/project/plugins.sbt @@ -0,0 +1,2 @@ +addSbtPlugin("org.scala-js" % "sbt-scalajs" % "0.6.28") +addSbtPlugin("ch.epfl.lamp" % "sbt-dotty" % "0.4.0") diff --git a/src/main/resources/stackoverflow/.keep b/src/main/resources/stackoverflow/.keep new file mode 100644 index 0000000..e69de29 diff --git a/src/main/scala/stackoverflow/StackOverflow.scala b/src/main/scala/stackoverflow/StackOverflow.scala new file mode 100644 index 0000000..42a75b7 --- /dev/null +++ b/src/main/scala/stackoverflow/StackOverflow.scala @@ -0,0 +1,308 @@ +package stackoverflow + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ +import org.apache.spark.rdd.RDD +import org.apache.log4j.{Logger, Level} + +import annotation.tailrec +import scala.reflect.ClassTag +import scala.util.Properties.isWin + +type Question = Posting +type Answer = Posting +type QID = Int +type HighScore = Int +type LangIndex = Int + +/** A raw stackoverflow posting, either a question or an answer */ +case class Posting(postingType: Int, id: Int, acceptedAnswer: Option[Int], parentId: Option[QID], score: Int, tags: Option[String]) extends Serializable + +/** The main class */ +object StackOverflow extends StackOverflow { + + // Reduce Spark logging verbosity + Logger.getLogger("org").setLevel(Level.ERROR) + + if (isWin) System.setProperty("hadoop.home.dir", System.getProperty("user.dir") + "\\winutils\\hadoop-2.7.4") + + @transient lazy val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("StackOverflow") + @transient lazy val sc: SparkContext = new SparkContext(conf) + + /** Main function */ + def main(args: Array[String]): Unit = { + + val lines = sc.textFile("src/main/resources/stackoverflow/stackoverflow-grading.csv") + val raw = rawPostings(lines) + val grouped = groupedPostings(raw) + val scored = scoredPostings(grouped) + val vectors = vectorPostings(scored) +// assert(vectors.count() == 2121822, "Incorrect number of vectors: " + vectors.count()) + + val means = kmeans(sampleVectors(vectors), vectors, debug = true) + val results = clusterResults(means, vectors) + printResults(results) + } +} + +/** The parsing and kmeans methods */ +class StackOverflow extends Serializable { + + /** Languages */ + val langs = + List( + "JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS", + "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy") + + /** K-means parameter: How "far apart" languages should be for the kmeans algorithm? */ + def langSpread = 50000 + assert(langSpread > 0, "If langSpread is zero we can't recover the language from the input data!") + + /** K-means parameter: Number of clusters */ + def kmeansKernels = 45 + + /** K-means parameter: Convergence criteria */ + def kmeansEta: Double = 20.0D + + /** K-means parameter: Maximum iterations */ + def kmeansMaxIterations = 120 + + + // + // + // Parsing utilities: + // + // + + /** Load postings from the given file */ + def rawPostings(lines: RDD[String]): RDD[Posting] = + lines.map(line => { + val arr = line.split(",") + Posting(postingType = arr(0).toInt, + id = arr(1).toInt, + acceptedAnswer = if (arr(2) == "") None else Some(arr(2).toInt), + parentId = if (arr(3) == "") None else Some(arr(3).toInt), + score = arr(4).toInt, + tags = if (arr.length >= 6) Some(arr(5).intern()) else None) + }) + + + /** Group the questions and answers together */ + def groupedPostings(postings: RDD[Posting]): RDD[(QID, Iterable[(Question, Answer)])] = { + ??? + } + + + /** Compute the maximum score for each posting */ + def scoredPostings(grouped: RDD[(QID, Iterable[(Question, Answer)])]): RDD[(Question, HighScore)] = { + + def answerHighScore(as: Array[Answer]): HighScore = { + var highScore = 0 + var i = 0 + while (i < as.length) { + val score = as(i).score + if (score > highScore) + highScore = score + i += 1 + } + highScore + } + + ??? + } + + + /** Compute the vectors for the kmeans */ + def vectorPostings(scored: RDD[(Question, HighScore)]): RDD[(LangIndex, HighScore)] = { + /** Return optional index of first language that occurs in `tags`. */ + def firstLangInTag(tag: Option[String], ls: List[String]): Option[Int] = { + if (tag.isEmpty) None + else if (ls.isEmpty) None + else if (tag.get == ls.head) Some(0) // index: 0 + else { + val tmp = firstLangInTag(tag, ls.tail) + tmp match { + case None => None + case Some(i) => Some(i + 1) // index i in ls.tail => index i+1 + } + } + } + + ??? + } + + + /** Sample the vectors */ + def sampleVectors(vectors: RDD[(LangIndex, HighScore)]): Array[(Int, Int)] = { + + assert(kmeansKernels % langs.length == 0, "kmeansKernels should be a multiple of the number of languages studied.") + val perLang = kmeansKernels / langs.length + + // http://en.wikipedia.org/wiki/Reservoir_sampling + def reservoirSampling(lang: Int, iter: Iterator[Int], size: Int): Array[Int] = { + val res = new Array[Int](size) + val rnd = new util.Random(lang) + + for (i <- 0 until size) { + assert(iter.hasNext, s"iterator must have at least $size elements") + res(i) = iter.next + } + + var i = size.toLong + while (iter.hasNext) { + val elt = iter.next + val j = math.abs(rnd.nextLong) % i + if (j < size) + res(j.toInt) = elt + i += 1 + } + + res + } + + val res = + if (langSpread < 500) + // sample the space regardless of the language + vectors.takeSample(false, kmeansKernels, 42) + else + // sample the space uniformly from each language partition + vectors.groupByKey.flatMap({ + case (lang, vectors) => reservoirSampling(lang, vectors.iterator, perLang).map((lang, _)) + }).collect() + + assert(res.length == kmeansKernels, res.length) + res + } + + + // + // + // Kmeans method: + // + // + + /** Main kmeans computation */ + @tailrec final def kmeans(means: Array[(Int, Int)], vectors: RDD[(Int, Int)], iter: Int = 1, debug: Boolean = false): Array[(Int, Int)] = { + val newMeans = means.clone() // you need to compute newMeans + + // TODO: Fill in the newMeans array + val distance = euclideanDistance(means, newMeans) + + if (debug) { + println(s"""Iteration: $iter + | * current distance: $distance + | * desired distance: $kmeansEta + | * means:""".stripMargin) + for (idx <- 0 until kmeansKernels) + println(f" ${means(idx).toString}%20s ==> ${newMeans(idx).toString}%20s " + + f" distance: ${euclideanDistance(means(idx), newMeans(idx))}%8.0f") + } + + if (converged(distance)) + newMeans + else if (iter < kmeansMaxIterations) + kmeans(newMeans, vectors, iter + 1, debug) + else { + if (debug) { + println("Reached max iterations!") + } + newMeans + } + } + + + + + // + // + // Kmeans utilities: + // + // + + /** Decide whether the kmeans clustering converged */ + def converged(distance: Double) = + distance < kmeansEta + + + /** Return the euclidean distance between two points */ + def euclideanDistance(v1: (Int, Int), v2: (Int, Int)): Double = { + val part1 = (v1._1 - v2._1).toDouble * (v1._1 - v2._1) + val part2 = (v1._2 - v2._2).toDouble * (v1._2 - v2._2) + part1 + part2 + } + + /** Return the euclidean distance between two points */ + def euclideanDistance(a1: Array[(Int, Int)], a2: Array[(Int, Int)]): Double = { + assert(a1.length == a2.length) + var sum = 0d + var idx = 0 + while(idx < a1.length) { + sum += euclideanDistance(a1(idx), a2(idx)) + idx += 1 + } + sum + } + + /** Return the closest point */ + def findClosest(p: (Int, Int), centers: Array[(Int, Int)]): Int = { + var bestIndex = 0 + var closest = Double.PositiveInfinity + for (i <- 0 until centers.length) { + val tempDist = euclideanDistance(p, centers(i)) + if (tempDist < closest) { + closest = tempDist + bestIndex = i + } + } + bestIndex + } + + + /** Average the vectors */ + def averageVectors(ps: Iterable[(Int, Int)]): (Int, Int) = { + val iter = ps.iterator + var count = 0 + var comp1: Long = 0 + var comp2: Long = 0 + while (iter.hasNext) { + val item = iter.next + comp1 += item._1 + comp2 += item._2 + count += 1 + } + ((comp1 / count).toInt, (comp2 / count).toInt) + } + + + + + // + // + // Displaying results: + // + // + def clusterResults(means: Array[(Int, Int)], vectors: RDD[(LangIndex, HighScore)]): Array[(String, Double, Int, Int)] = { + val closest = vectors.map(p => (findClosest(p, means), p)) + val closestGrouped = closest.groupByKey() + + val median = closestGrouped.mapValues { vs => + val langLabel: String = ??? // most common language in the cluster + val langPercent: Double = ??? // percent of the questions in the most common language + val clusterSize: Int = ??? + val medianScore: Int = ??? + + (langLabel, langPercent, clusterSize, medianScore) + } + + median.collect().map(_._2).sortBy(_._4) + } + + def printResults(results: Array[(String, Double, Int, Int)]): Unit = { + println("Resulting clusters:") + println(" Score Dominant language (%percent) Questions") + println("================================================") + for ((lang, percent, size, score) <- results) + println(f"${score}%7d ${lang}%-17s (${percent}%-5.1f%%) ${size}%7d") + } +} diff --git a/src/test/scala/stackoverflow/StackOverflowSuite.scala b/src/test/scala/stackoverflow/StackOverflowSuite.scala new file mode 100644 index 0000000..fb9a4f1 --- /dev/null +++ b/src/test/scala/stackoverflow/StackOverflowSuite.scala @@ -0,0 +1,47 @@ +package stackoverflow + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ +import org.apache.spark.rdd.RDD +import org.junit._ +import org.junit.Assert.assertEquals +import java.io.File +import scala.io.{ Codec, Source } +import scala.util.Properties.isWin + +object StackOverflowSuite { + if (isWin) System.setProperty("hadoop.home.dir", System.getProperty("user.dir") + "\\winutils\\hadoop-2.7.4") + + val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("StackOverflow") + val sc: SparkContext = new SparkContext(conf) +} + +class StackOverflowSuite { + import StackOverflowSuite._ + + + lazy val testObject = new StackOverflow { + override val langs = + List( + "JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS", + "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy") + override def langSpread = 50000 + override def kmeansKernels = 45 + override def kmeansEta: Double = 20.0D + override def kmeansMaxIterations = 120 + } + + @Test def `testObject can be instantiated`: Unit = { + val instantiatable = try { + testObject + true + } catch { + case _: Throwable => false + } + assert(instantiatable, "Can't instantiate a StackOverflow object") + } + + + @Rule def individualTestTimeout = new org.junit.rules.Timeout(300 * 1000) +} diff --git a/winutils/hadoop-2.7.4/bin/winutils.exe b/winutils/hadoop-2.7.4/bin/winutils.exe new file mode 100644 index 0000000..9f30639 Binary files /dev/null and b/winutils/hadoop-2.7.4/bin/winutils.exe differ