Add stackoverflow assignment
This commit is contained in:
commit
0a94b65038
20
.gitignore
vendored
Normal file
20
.gitignore
vendored
Normal file
@ -0,0 +1,20 @@
|
||||
# General
*.DS_Store
*.swp
*~

# Dotty
*.class
*.tasty
*.hasTasty

# sbt
target/

# Dotty IDE
/.dotty-ide-artifact
/.dotty-ide.json

# datasets
stackoverflow-grading.csv
wikipedia-grading.dat
|
||||
36
.gitlab-ci.yml
Normal file
36
.gitlab-ci.yml
Normal file
@ -0,0 +1,36 @@
|
||||
# DO NOT EDIT THIS FILE

stages:
  - build
  - grade

compile:
  stage: build
  image: lampepfl/moocs:dotty-2020-02-12
  except:
    - tags
  tags:
    - cs206
  script:
    - sbt packageSubmission
  artifacts:
    expire_in: 1 day
    paths:
      - submission.jar

grade:
  stage: grade
  except:
    - tags
  tags:
    - cs206
  image:
    name: smarter3/moocs:bigdata-stackoverflow-2020-05-11-2
    entrypoint: [""]
  allow_failure: true
  before_script:
    - mkdir -p /shared/submission/
    - cp submission.jar /shared/submission/submission.jar
  script:
    - cd /grader
    - /grader/grade | /grader/feedback-printer
|
||||
8
.vscode/settings.json
vendored
Normal file
8
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
{
  "dotty": {
    "trace": {
      "remoteTracingUrl": "wss://lamppc36.epfl.ch/dotty-remote-tracer/upload/lsp.log",
      "server": { "format": "JSON", "verbosity": "verbose" }
    }
  }
}
|
||||
9
assignment.sbt
Normal file
9
assignment.sbt
Normal file
@ -0,0 +1,9 @@
|
||||
// Student tasks (i.e. submit, packageSubmission)
enablePlugins(StudentTasks)

// Coursera metadata identifying this assignment for submission.
courseraId := ch.epfl.lamp.CourseraId(
  key = "7ByAoS4kEea1yxIfJA1CUw",
  itemId = "QhzMw",
  premiumItemId = Some("FWGnz"),
  partId = "OY5fJ"
)
|
||||
20
build.sbt
Normal file
20
build.sbt
Normal file
@ -0,0 +1,20 @@
|
||||
course := "bigdata"
assignment := "stackoverflow"

scalaVersion := "0.24.0-RC1"
scalacOptions ++= Seq("-language:implicitConversions", "-deprecation")
libraryDependencies ++= Seq(
  "com.novocode" % "junit-interface" % "0.11" % Test,
  ("org.apache.spark" %% "spark-core" % "3.0.0-X1").withDottyCompat(scalaVersion.value)
)

// Contains Spark 3 snapshot built against 2.13: https://github.com/smarter/spark/tree/scala-2.13
resolvers += Resolver.bintrayRepo("smarter", "maven")

// -a: show stack traces, -v: verbose, -s: show durations for each test.
testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-a", "-v", "-s")

testSuite := "stackoverflow.StackOverflowSuite"

// Without forking, ctrl-c doesn't actually fully stop Spark
fork in run := true
fork in Test := true
|
||||
BIN
grading-tests.jar
Normal file
BIN
grading-tests.jar
Normal file
Binary file not shown.
46
project/MOOCSettings.scala
Normal file
46
project/MOOCSettings.scala
Normal file
@ -0,0 +1,46 @@
|
||||
package ch.epfl.lamp

import sbt._
import sbt.Keys._

/**
 * Coursera uses two versions of each assignment. They both have the same assignment key and part id but have
 * different item ids.
 *
 * @param key           Assignment key
 * @param partId        Assignment partId
 * @param itemId        Item id of the non premium version
 * @param premiumItemId Item id of the premium version (`None` if the assignment is optional)
 */
case class CourseraId(key: String, partId: String, itemId: String, premiumItemId: Option[String])
|
||||
|
||||
/**
 * Settings shared by all assignments, reused in various tasks.
 */
object MOOCSettings extends AutoPlugin {

  object autoImport {
    // Course/assignment identity; combined below to form the project name.
    val course = SettingKey[String]("course")
    val assignment = SettingKey[String]("assignment")
    val options = SettingKey[Map[String, Map[String, String]]]("options")
    val courseraId = settingKey[CourseraId]("Coursera-specific information identifying the assignment")
    val testSuite = settingKey[String]("Fully qualified name of the test suite of this assignment")
    // Convenient alias so build definitions can write `CourseraId(...)` without an extra import.
    type CourseraId = ch.epfl.lamp.CourseraId
    val CourseraId = ch.epfl.lamp.CourseraId
  }

  import autoImport._

  override val globalSettings: Seq[Def.Setting[_]] = Seq(
    // supershell is verbose, buggy and useless.
    useSuperShell := false
  )

  override val projectSettings: Seq[Def.Setting[_]] = Seq(
    // Spark-based tests must not run concurrently (shared SparkContext).
    parallelExecution in Test := false,
    // Report test result after each test instead of waiting for every test to finish
    logBuffered in Test := false,
    name := s"${course.value}-${assignment.value}"
  )
}
|
||||
318
project/StudentTasks.scala
Normal file
318
project/StudentTasks.scala
Normal file
@ -0,0 +1,318 @@
|
||||
package ch.epfl.lamp

import sbt._
import Keys._

// import scalaj.http._
import java.io.{File, FileInputStream, IOException}
import org.apache.commons.codec.binary.Base64
// import play.api.libs.json.{Json, JsObject, JsPath}
import scala.util.{Failure, Success, Try}

/**
 * Provides tasks for submitting the assignment
 */
object StudentTasks extends AutoPlugin {

  // This plugin only makes sense together with the shared MOOC settings.
  override def requires = super.requires && MOOCSettings

  object autoImport {
    val packageSourcesOnly = TaskKey[File]("packageSourcesOnly", "Package the sources of the project")
    val packageBinWithoutResources = TaskKey[File]("packageBinWithoutResources", "Like packageBin, but without the resources")
    val packageSubmissionZip = TaskKey[File]("packageSubmissionZip")
    val packageSubmission = inputKey[Unit]("package solution as an archive file")
    val runGradingTests = taskKey[Unit]("run black-box tests used for final grading")
  }

  import autoImport._
  import MOOCSettings.autoImport._

  override lazy val projectSettings = Seq(
    packageSubmissionSetting,
    // submitSetting,
    runGradingTestsSettings,

    // Forked JVM with stdin/stdout wired through so interactive runs work.
    fork := true,
    connectInput in run := true,
    outputStrategy := Some(StdoutOutput),
  ) ++ packageSubmissionZipSettings
|
||||
|
||||
lazy val runGradingTestsSettings = runGradingTests := {
|
||||
val testSuiteJar = "grading-tests.jar"
|
||||
if (!new File(testSuiteJar).exists) {
|
||||
throw new MessageOnlyException(s"Could not find tests JarFile: $testSuiteJar")
|
||||
}
|
||||
|
||||
val classPath = s"${(Test / dependencyClasspath).value.map(_.data).mkString(File.pathSeparator)}${File.pathSeparator}$testSuiteJar"
|
||||
val junitProcess =
|
||||
Fork.java.fork(
|
||||
ForkOptions(),
|
||||
"-cp" :: classPath ::
|
||||
"org.junit.runner.JUnitCore" ::
|
||||
(Test / testSuite).value ::
|
||||
Nil
|
||||
)
|
||||
|
||||
// Wait for tests to complete.
|
||||
junitProcess.exitValue()
|
||||
}
|
||||
|
||||
|
||||
/** **********************************************************
|
||||
* SUBMITTING A SOLUTION TO COURSERA
|
||||
*/
|
||||
|
||||
val packageSubmissionZipSettings = Seq(
|
||||
packageSubmissionZip := {
|
||||
val submission = crossTarget.value / "submission.zip"
|
||||
val sources = (packageSourcesOnly in Compile).value
|
||||
val binaries = (packageBinWithoutResources in Compile).value
|
||||
IO.zip(Seq(sources -> "sources.zip", binaries -> "binaries.jar"), submission)
|
||||
submission
|
||||
},
|
||||
artifactClassifier in packageSourcesOnly := Some("sources"),
|
||||
artifact in (Compile, packageBinWithoutResources) ~= (art => art.withName(art.name + "-without-resources"))
|
||||
) ++
|
||||
inConfig(Compile)(
|
||||
Defaults.packageTaskSettings(packageSourcesOnly, Defaults.sourceMappings) ++
|
||||
Defaults.packageTaskSettings(packageBinWithoutResources, Def.task {
|
||||
val relativePaths =
|
||||
(unmanagedResources in Compile).value.flatMap(Path.relativeTo((unmanagedResourceDirectories in Compile).value)(_))
|
||||
(mappings in (Compile, packageBin)).value.filterNot { case (_, path) => relativePaths.contains(path) }
|
||||
})
|
||||
)
|
||||
|
||||
val maxSubmitFileSize = {
|
||||
val mb = 1024 * 1024
|
||||
10 * mb
|
||||
}
|
||||
|
||||
/** Check that the jar exists, isn't empty, isn't crazy big, and can be read
|
||||
* If so, encode jar as base64 so we can send it to Coursera
|
||||
*/
|
||||
def prepareJar(jar: File, s: TaskStreams): String = {
|
||||
val errPrefix = "Error submitting assignment jar: "
|
||||
val fileLength = jar.length()
|
||||
if (!jar.exists()) {
|
||||
s.log.error(errPrefix + "jar archive does not exist\n" + jar.getAbsolutePath)
|
||||
failSubmit()
|
||||
} else if (fileLength == 0L) {
|
||||
s.log.error(errPrefix + "jar archive is empty\n" + jar.getAbsolutePath)
|
||||
failSubmit()
|
||||
} else if (fileLength > maxSubmitFileSize) {
|
||||
s.log.error(errPrefix + "jar archive is too big. Allowed size: " +
|
||||
maxSubmitFileSize + " bytes, found " + fileLength + " bytes.\n" +
|
||||
jar.getAbsolutePath)
|
||||
failSubmit()
|
||||
} else {
|
||||
val bytes = new Array[Byte](fileLength.toInt)
|
||||
val sizeRead = try {
|
||||
val is = new FileInputStream(jar)
|
||||
val read = is.read(bytes)
|
||||
is.close()
|
||||
read
|
||||
} catch {
|
||||
case ex: IOException =>
|
||||
s.log.error(errPrefix + "failed to read sources jar archive\n" + ex.toString)
|
||||
failSubmit()
|
||||
}
|
||||
if (sizeRead != bytes.length) {
|
||||
s.log.error(errPrefix + "failed to read the sources jar archive, size read: " + sizeRead)
|
||||
failSubmit()
|
||||
} else encodeBase64(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
/** Task to package solution to a given file path */
|
||||
lazy val packageSubmissionSetting = packageSubmission := {
|
||||
val args: Seq[String] = Def.spaceDelimited("[path]").parsed
|
||||
val s: TaskStreams = streams.value // for logging
|
||||
val jar = (packageSubmissionZip in Compile).value
|
||||
|
||||
val base64Jar = prepareJar(jar, s)
|
||||
|
||||
val path = args.headOption.getOrElse((baseDirectory.value / "submission.jar").absolutePath)
|
||||
scala.tools.nsc.io.File(path).writeAll(base64Jar)
|
||||
}
|
||||
|
||||
/*
|
||||
/** Task to submit a solution to coursera */
|
||||
val submit = inputKey[Unit]("submit solution to Coursera")
|
||||
lazy val submitSetting = submit := {
|
||||
// Fail if scalafix linting does not pass.
|
||||
scalafixLinting.value
|
||||
|
||||
val args: Seq[String] = Def.spaceDelimited("<arg>").parsed
|
||||
val s: TaskStreams = streams.value // for logging
|
||||
val jar = (packageSubmissionZip in Compile).value
|
||||
|
||||
val assignmentDetails =
|
||||
courseraId.?.value.getOrElse(throw new MessageOnlyException("This assignment can not be submitted to Coursera because the `courseraId` setting is undefined"))
|
||||
val assignmentKey = assignmentDetails.key
|
||||
val courseName =
|
||||
course.value match {
|
||||
case "capstone" => "scala-capstone"
|
||||
case "bigdata" => "scala-spark-big-data"
|
||||
case other => other
|
||||
}
|
||||
|
||||
val partId = assignmentDetails.partId
|
||||
val itemId = assignmentDetails.itemId
|
||||
val premiumItemId = assignmentDetails.premiumItemId
|
||||
|
||||
val (email, secret) = args match {
|
||||
case email :: secret :: Nil =>
|
||||
(email, secret)
|
||||
case _ =>
|
||||
val inputErr =
|
||||
s"""|Invalid input to `submit`. The required syntax for `submit` is:
|
||||
|submit <email-address> <submit-token>
|
||||
|
|
||||
|The submit token is NOT YOUR LOGIN PASSWORD.
|
||||
|It can be obtained from the assignment page:
|
||||
|https://www.coursera.org/learn/$courseName/programming/$itemId
|
||||
|${
|
||||
premiumItemId.fold("") { id =>
|
||||
s"""or (for premium learners):
|
||||
|https://www.coursera.org/learn/$courseName/programming/$id
|
||||
""".stripMargin
|
||||
}
|
||||
}
|
||||
""".stripMargin
|
||||
s.log.error(inputErr)
|
||||
failSubmit()
|
||||
}
|
||||
|
||||
val base64Jar = prepareJar(jar, s)
|
||||
val json =
|
||||
s"""|{
|
||||
| "assignmentKey":"$assignmentKey",
|
||||
| "submitterEmail":"$email",
|
||||
| "secret":"$secret",
|
||||
| "parts":{
|
||||
| "$partId":{
|
||||
| "output":"$base64Jar"
|
||||
| }
|
||||
| }
|
||||
|}""".stripMargin
|
||||
|
||||
def postSubmission[T](data: String): Try[HttpResponse[String]] = {
|
||||
val http = Http("https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1")
|
||||
val hs = List(
|
||||
("Cache-Control", "no-cache"),
|
||||
("Content-Type", "application/json")
|
||||
)
|
||||
s.log.info("Connecting to Coursera...")
|
||||
val response = Try(http.postData(data)
|
||||
.headers(hs)
|
||||
.option(HttpOptions.connTimeout(10000)) // scalaj default timeout is only 100ms, changing that to 10s
|
||||
.asString) // kick off HTTP POST
|
||||
response
|
||||
}
|
||||
|
||||
val connectMsg =
|
||||
s"""|Attempting to submit "${assignment.value}" assignment in "$courseName" course
|
||||
|Using:
|
||||
|- email: $email
|
||||
|- submit token: $secret""".stripMargin
|
||||
s.log.info(connectMsg)
|
||||
|
||||
def reportCourseraResponse(response: HttpResponse[String]): Unit = {
|
||||
val code = response.code
|
||||
val respBody = response.body
|
||||
|
||||
/* Sample JSON response from Coursera
|
||||
{
|
||||
"message": "Invalid email or token.",
|
||||
"details": {
|
||||
"learnerMessage": "Invalid email or token."
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// Success, Coursera responds with 2xx HTTP status code
|
||||
if (response.is2xx) {
|
||||
val successfulSubmitMsg =
|
||||
s"""|Successfully connected to Coursera. (Status $code)
|
||||
|
|
||||
|Assignment submitted successfully!
|
||||
|
|
||||
|You can see how you scored by going to:
|
||||
|https://www.coursera.org/learn/$courseName/programming/$itemId/
|
||||
|${
|
||||
premiumItemId.fold("") { id =>
|
||||
s"""or (for premium learners):
|
||||
|https://www.coursera.org/learn/$courseName/programming/$id
|
||||
""".stripMargin
|
||||
}
|
||||
}
|
||||
|and clicking on "My Submission".""".stripMargin
|
||||
s.log.info(successfulSubmitMsg)
|
||||
}
|
||||
|
||||
// Failure, Coursera responds with 4xx HTTP status code (client-side failure)
|
||||
else if (response.is4xx) {
|
||||
val result = Try(Json.parse(respBody)).toOption
|
||||
val learnerMsg = result match {
|
||||
case Some(resp: JsObject) =>
|
||||
(JsPath \ "details" \ "learnerMessage").read[String].reads(resp).get
|
||||
case Some(x) => // shouldn't happen
|
||||
"Could not parse Coursera's response:\n" + x
|
||||
case None =>
|
||||
"Could not parse Coursera's response:\n" + respBody
|
||||
}
|
||||
val failedSubmitMsg =
|
||||
s"""|Submission failed.
|
||||
|There was something wrong while attempting to submit.
|
||||
|Coursera says:
|
||||
|$learnerMsg (Status $code)""".stripMargin
|
||||
s.log.error(failedSubmitMsg)
|
||||
}
|
||||
|
||||
// Failure, Coursera responds with 5xx HTTP status code (server-side failure)
|
||||
else if (response.is5xx) {
|
||||
val failedSubmitMsg =
|
||||
s"""|Submission failed.
|
||||
|Coursera seems to be unavailable at the moment (Status $code)
|
||||
|Check https://status.coursera.org/ and try again in a few minutes.
|
||||
""".stripMargin
|
||||
s.log.error(failedSubmitMsg)
|
||||
}
|
||||
|
||||
// Failure, Coursera repsonds with an unexpected status code
|
||||
else {
|
||||
val failedSubmitMsg =
|
||||
s"""|Submission failed.
|
||||
|Coursera replied with an unexpected code (Status $code)
|
||||
""".stripMargin
|
||||
s.log.error(failedSubmitMsg)
|
||||
}
|
||||
}
|
||||
|
||||
// kick it all off, actually make request
|
||||
postSubmission(json) match {
|
||||
case Success(resp) => reportCourseraResponse(resp)
|
||||
case Failure(e) =>
|
||||
val failedConnectMsg =
|
||||
s"""|Connection to Coursera failed.
|
||||
|There was something wrong while attempting to connect to Coursera.
|
||||
|Check your internet connection.
|
||||
|${e.toString}""".stripMargin
|
||||
s.log.error(failedConnectMsg)
|
||||
}
|
||||
|
||||
}
|
||||
*/
|
||||
|
||||
def failSubmit(): Nothing = {
|
||||
sys.error("Submission failed")
|
||||
}
|
||||
|
||||
/**
|
||||
* *****************
|
||||
* DEALING WITH JARS
|
||||
*/
|
||||
def encodeBase64(bytes: Array[Byte]): String =
|
||||
new String(Base64.encodeBase64(bytes))
|
||||
}
|
||||
1
project/build.properties
Normal file
1
project/build.properties
Normal file
@ -0,0 +1 @@
|
||||
sbt.version=1.3.8
|
||||
5
project/buildSettings.sbt
Normal file
5
project/buildSettings.sbt
Normal file
@ -0,0 +1,5 @@
|
||||
// Used for Coursera submission (StudentPlugin)
// libraryDependencies += "org.scalaj" %% "scalaj-http" % "2.4.2"
// libraryDependencies += "com.typesafe.play" %% "play-json" % "2.7.4"
// Used for Base64 (StudentPlugin)
libraryDependencies += "commons-codec" % "commons-codec" % "1.10"
|
||||
2
project/plugins.sbt
Normal file
2
project/plugins.sbt
Normal file
@ -0,0 +1,2 @@
|
||||
addSbtPlugin("org.scala-js" % "sbt-scalajs" % "0.6.28")
addSbtPlugin("ch.epfl.lamp" % "sbt-dotty" % "0.4.0")
|
||||
0
src/main/resources/stackoverflow/.keep
Normal file
0
src/main/resources/stackoverflow/.keep
Normal file
308
src/main/scala/stackoverflow/StackOverflow.scala
Normal file
308
src/main/scala/stackoverflow/StackOverflow.scala
Normal file
@ -0,0 +1,308 @@
|
||||
package stackoverflow

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.log4j.{Logger, Level}

import annotation.tailrec
import scala.reflect.ClassTag
import scala.util.Properties.isWin

// Domain aliases used throughout the assignment (Scala 3 top-level type aliases).
type Question = Posting
type Answer = Posting
type QID = Int
type HighScore = Int
type LangIndex = Int

/** A raw stackoverflow posting, either a question or an answer.
 *
 *  @param postingType    1 for questions, 2 for answers (per the CSV format parsed in `rawPostings`)
 *  @param id             unique posting id
 *  @param acceptedAnswer id of the accepted answer, if any (questions only)
 *  @param parentId       id of the question this answer belongs to (answers only)
 *  @param score          the posting's score
 *  @param tags           language tag of the posting, if present
 */
case class Posting(postingType: Int, id: Int, acceptedAnswer: Option[Int], parentId: Option[QID], score: Int, tags: Option[String]) extends Serializable
|
||||
|
||||
/** The main object: wires up Spark and runs the full pipeline. */
object StackOverflow extends StackOverflow {

  // Reduce Spark logging verbosity
  Logger.getLogger("org").setLevel(Level.ERROR)

  // On Windows, point Hadoop at the bundled winutils binaries.
  if (isWin) System.setProperty("hadoop.home.dir", System.getProperty("user.dir") + "\\winutils\\hadoop-2.7.4")

  @transient lazy val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("StackOverflow")
  @transient lazy val sc: SparkContext = new SparkContext(conf)

  /** Main function: parse -> group -> score -> vectorize -> cluster -> print. */
  def main(args: Array[String]): Unit = {

    val lines   = sc.textFile("src/main/resources/stackoverflow/stackoverflow-grading.csv")
    val raw     = rawPostings(lines)
    val grouped = groupedPostings(raw)
    val scored  = scoredPostings(grouped)
    val vectors = vectorPostings(scored)
    // assert(vectors.count() == 2121822, "Incorrect number of vectors: " + vectors.count())

    val means   = kmeans(sampleVectors(vectors), vectors, debug = true)
    val results = clusterResults(means, vectors)
    printResults(results)
  }
}
|
||||
|
||||
/** The parsing and kmeans methods. Assignment stubs (`???`) are intentionally left for the student. */
class StackOverflow extends Serializable {

  /** Languages */
  val langs =
    List(
      "JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS",
      "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy")

  /** K-means parameter: How "far apart" languages should be for the kmeans algorithm? */
  def langSpread = 50000
  assert(langSpread > 0, "If langSpread is zero we can't recover the language from the input data!")

  /** K-means parameter: Number of clusters */
  def kmeansKernels = 45

  /** K-means parameter: Convergence criteria */
  def kmeansEta: Double = 20.0D

  /** K-means parameter: Maximum iterations */
  def kmeansMaxIterations = 120

  //
  //
  // Parsing utilities:
  //
  //

  /** Load postings from the given file.
   *  Expected CSV columns: postingType, id, acceptedAnswer, parentId, score[, tag].
   */
  def rawPostings(lines: RDD[String]): RDD[Posting] =
    lines.map(line => {
      val arr = line.split(",")
      Posting(postingType =    arr(0).toInt,
              id =             arr(1).toInt,
              acceptedAnswer = if (arr(2) == "") None else Some(arr(2).toInt),
              parentId =       if (arr(3) == "") None else Some(arr(3).toInt),
              score =          arr(4).toInt,
              // intern() because the small set of tag strings is repeated millions of times
              tags =           if (arr.length >= 6) Some(arr(5).intern()) else None)
    })

  /** Group the questions and answers together */
  def groupedPostings(postings: RDD[Posting]): RDD[(QID, Iterable[(Question, Answer)])] = {
    ???
  }

  /** Compute the maximum score for each posting */
  def scoredPostings(grouped: RDD[(QID, Iterable[(Question, Answer)])]): RDD[(Question, HighScore)] = {

    // Imperative max over the answers' scores; 0 if the array is empty.
    def answerHighScore(as: Array[Answer]): HighScore = {
      var highScore = 0
      var i = 0
      while (i < as.length) {
        val score = as(i).score
        if (score > highScore)
          highScore = score
        i += 1
      }
      highScore
    }

    ???
  }

  /** Compute the vectors for the kmeans */
  def vectorPostings(scored: RDD[(Question, HighScore)]): RDD[(LangIndex, HighScore)] = {
    /** Return optional index of first language that occurs in `tags`. */
    def firstLangInTag(tag: Option[String], ls: List[String]): Option[Int] = {
      if (tag.isEmpty) None
      else if (ls.isEmpty) None
      else if (tag.get == ls.head) Some(0) // index: 0
      else {
        val tmp = firstLangInTag(tag, ls.tail)
        tmp match {
          case None => None
          case Some(i) => Some(i + 1) // index i in ls.tail => index i+1
        }
      }
    }

    ???
  }

  /** Sample `kmeansKernels` vectors to serve as the initial cluster means. */
  def sampleVectors(vectors: RDD[(LangIndex, HighScore)]): Array[(Int, Int)] = {

    assert(kmeansKernels % langs.length == 0, "kmeansKernels should be a multiple of the number of languages studied.")
    val perLang = kmeansKernels / langs.length

    // http://en.wikipedia.org/wiki/Reservoir_sampling
    def reservoirSampling(lang: Int, iter: Iterator[Int], size: Int): Array[Int] = {
      val res = new Array[Int](size)
      // Seeded with the language index so sampling is deterministic per language.
      val rnd = new util.Random(lang)

      for (i <- 0 until size) {
        assert(iter.hasNext, s"iterator must have at least $size elements")
        res(i) = iter.next
      }

      var i = size.toLong
      while (iter.hasNext) {
        val elt = iter.next
        val j = math.abs(rnd.nextLong) % i
        if (j < size)
          res(j.toInt) = elt
        i += 1
      }

      res
    }

    val res =
      if (langSpread < 500)
        // sample the space regardless of the language
        vectors.takeSample(false, kmeansKernels, 42)
      else
        // sample the space uniformly from each language partition
        vectors.groupByKey.flatMap({
          case (lang, vectors) => reservoirSampling(lang, vectors.iterator, perLang).map((lang, _))
        }).collect()

    assert(res.length == kmeansKernels, res.length)
    res
  }

  //
  //
  // Kmeans method:
  //
  //

  /** Main kmeans computation: iterate until the total movement of the means
   *  drops below `kmeansEta` or `kmeansMaxIterations` is reached.
   */
  @tailrec final def kmeans(means: Array[(Int, Int)], vectors: RDD[(Int, Int)], iter: Int = 1, debug: Boolean = false): Array[(Int, Int)] = {
    val newMeans = means.clone() // you need to compute newMeans

    // TODO: Fill in the newMeans array
    val distance = euclideanDistance(means, newMeans)

    if (debug) {
      println(s"""Iteration: $iter
                 |  * current distance: $distance
                 |  * desired distance: $kmeansEta
                 |  * means:""".stripMargin)
      for (idx <- 0 until kmeansKernels)
        println(f"   ${means(idx).toString}%20s ==> ${newMeans(idx).toString}%20s " +
                f"  distance: ${euclideanDistance(means(idx), newMeans(idx))}%8.0f")
    }

    if (converged(distance))
      newMeans
    else if (iter < kmeansMaxIterations)
      kmeans(newMeans, vectors, iter + 1, debug)
    else {
      if (debug) {
        println("Reached max iterations!")
      }
      newMeans
    }
  }

  //
  //
  // Kmeans utilities:
  //
  //

  /** Decide whether the kmeans clustering converged */
  def converged(distance: Double) =
    distance < kmeansEta

  /** Return the squared euclidean distance between two points.
   *  (Squared distance is sufficient for comparisons and convergence checks.)
   */
  def euclideanDistance(v1: (Int, Int), v2: (Int, Int)): Double = {
    val part1 = (v1._1 - v2._1).toDouble * (v1._1 - v2._1)
    val part2 = (v1._2 - v2._2).toDouble * (v1._2 - v2._2)
    part1 + part2
  }

  /** Return the summed pairwise euclidean distance between two arrays of points. */
  def euclideanDistance(a1: Array[(Int, Int)], a2: Array[(Int, Int)]): Double = {
    assert(a1.length == a2.length)
    var sum = 0d
    var idx = 0
    while(idx < a1.length) {
      sum += euclideanDistance(a1(idx), a2(idx))
      idx += 1
    }
    sum
  }

  /** Return the index of the center closest to `p`. */
  def findClosest(p: (Int, Int), centers: Array[(Int, Int)]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity
    for (i <- 0 until centers.length) {
      val tempDist = euclideanDistance(p, centers(i))
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }
    bestIndex
  }

  /** Average the vectors (component-wise integer mean; assumes `ps` is non-empty). */
  def averageVectors(ps: Iterable[(Int, Int)]): (Int, Int) = {
    val iter = ps.iterator
    var count = 0
    var comp1: Long = 0
    var comp2: Long = 0
    while (iter.hasNext) {
      val item = iter.next
      comp1 += item._1
      comp2 += item._2
      count += 1
    }
    ((comp1 / count).toInt, (comp2 / count).toInt)
  }

  //
  //
  // Displaying results:
  //
  //

  /** Summarize each cluster: dominant language, its share, cluster size, median score.
   *  Results are sorted by median score.
   */
  def clusterResults(means: Array[(Int, Int)], vectors: RDD[(LangIndex, HighScore)]): Array[(String, Double, Int, Int)] = {
    val closest = vectors.map(p => (findClosest(p, means), p))
    val closestGrouped = closest.groupByKey()

    val median = closestGrouped.mapValues { vs =>
      val langLabel: String   = ??? // most common language in the cluster
      val langPercent: Double = ??? // percent of the questions in the most common language
      val clusterSize: Int    = ???
      val medianScore: Int    = ???

      (langLabel, langPercent, clusterSize, medianScore)
    }

    median.collect().map(_._2).sortBy(_._4)
  }

  /** Pretty-print the cluster summary table. */
  def printResults(results: Array[(String, Double, Int, Int)]): Unit = {
    println("Resulting clusters:")
    println("  Score  Dominant language (%percent)  Questions")
    println("================================================")
    for ((lang, percent, size, score) <- results)
      println(f"${score}%7d  ${lang}%-17s (${percent}%-5.1f%%)      ${size}%7d")
  }
}
|
||||
47
src/test/scala/stackoverflow/StackOverflowSuite.scala
Normal file
47
src/test/scala/stackoverflow/StackOverflowSuite.scala
Normal file
@ -0,0 +1,47 @@
|
||||
package stackoverflow

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.junit._
import org.junit.Assert.assertEquals
import java.io.File
import scala.io.{ Codec, Source }
import scala.util.Properties.isWin

/** Shared Spark fixture for the suite: one local SparkContext for all tests. */
object StackOverflowSuite {
  // On Windows, point Hadoop at the bundled winutils binaries.
  if (isWin) System.setProperty("hadoop.home.dir", System.getProperty("user.dir") + "\\winutils\\hadoop-2.7.4")

  val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("StackOverflow")
  val sc: SparkContext = new SparkContext(conf)
}
|
||||
|
||||
class StackOverflowSuite {
  import StackOverflowSuite._

  // A StackOverflow instance with the default assignment parameters pinned explicitly,
  // so tests keep working even if the defaults in StackOverflow change.
  lazy val testObject = new StackOverflow {
    override val langs =
      List(
        "JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS",
        "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy")
    override def langSpread = 50000
    override def kmeansKernels = 45
    override def kmeansEta: Double = 20.0D
    override def kmeansMaxIterations = 120
  }

  @Test def `testObject can be instantiated`: Unit = {
    val instantiatable = try {
      testObject
      true
    } catch {
      case _: Throwable => false
    }
    assert(instantiatable, "Can't instantiate a StackOverflow object")
  }

  // 300 s per-test timeout so a hung Spark job fails the build instead of blocking it.
  @Rule def individualTestTimeout = new org.junit.rules.Timeout(300 * 1000)
}
|
||||
BIN
winutils/hadoop-2.7.4/bin/winutils.exe
Normal file
BIN
winutils/hadoop-2.7.4/bin/winutils.exe
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user