Add stackoverflow assignment
This commit is contained in:
commit
0a94b65038
20
.gitignore
vendored
Normal file
20
.gitignore
vendored
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
# General
|
||||||
|
*.DS_Store
|
||||||
|
*.swp
|
||||||
|
*~
|
||||||
|
|
||||||
|
# Dotty
|
||||||
|
*.class
|
||||||
|
*.tasty
|
||||||
|
*.hasTasty
|
||||||
|
|
||||||
|
# sbt
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Dotty IDE
|
||||||
|
/.dotty-ide-artifact
|
||||||
|
/.dotty-ide.json
|
||||||
|
|
||||||
|
# datasets
|
||||||
|
stackoverflow-grading.csv
|
||||||
|
wikipedia-grading.dat
|
||||||
36
.gitlab-ci.yml
Normal file
36
.gitlab-ci.yml
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
# DO NOT EDIT THIS FILE
|
||||||
|
|
||||||
|
stages:
|
||||||
|
- build
|
||||||
|
- grade
|
||||||
|
|
||||||
|
compile:
|
||||||
|
stage: build
|
||||||
|
image: lampepfl/moocs:dotty-2020-02-12
|
||||||
|
except:
|
||||||
|
- tags
|
||||||
|
tags:
|
||||||
|
- cs206
|
||||||
|
script:
|
||||||
|
- sbt packageSubmission
|
||||||
|
artifacts:
|
||||||
|
expire_in: 1 day
|
||||||
|
paths:
|
||||||
|
- submission.jar
|
||||||
|
|
||||||
|
grade:
|
||||||
|
stage: grade
|
||||||
|
except:
|
||||||
|
- tags
|
||||||
|
tags:
|
||||||
|
- cs206
|
||||||
|
image:
|
||||||
|
name: smarter3/moocs:bigdata-stackoverflow-2020-05-11-2
|
||||||
|
entrypoint: [""]
|
||||||
|
allow_failure: true
|
||||||
|
before_script:
|
||||||
|
- mkdir -p /shared/submission/
|
||||||
|
- cp submission.jar /shared/submission/submission.jar
|
||||||
|
script:
|
||||||
|
- cd /grader
|
||||||
|
- /grader/grade | /grader/feedback-printer
|
||||||
8
.vscode/settings.json
vendored
Normal file
8
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
"dotty": {
|
||||||
|
"trace": {
|
||||||
|
"remoteTracingUrl": "wss://lamppc36.epfl.ch/dotty-remote-tracer/upload/lsp.log",
|
||||||
|
"server": { "format": "JSON", "verbosity": "verbose" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
9
assignment.sbt
Normal file
9
assignment.sbt
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
// Student tasks (i.e. submit, packageSubmission)
|
||||||
|
enablePlugins(StudentTasks)
|
||||||
|
|
||||||
|
courseraId := ch.epfl.lamp.CourseraId(
|
||||||
|
key = "7ByAoS4kEea1yxIfJA1CUw",
|
||||||
|
itemId = "QhzMw",
|
||||||
|
premiumItemId = Some("FWGnz"),
|
||||||
|
partId = "OY5fJ"
|
||||||
|
)
|
||||||
20
build.sbt
Normal file
20
build.sbt
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
course := "bigdata"
|
||||||
|
assignment := "stackoverflow"
|
||||||
|
|
||||||
|
scalaVersion := "0.24.0-RC1"
|
||||||
|
scalacOptions ++= Seq("-language:implicitConversions", "-deprecation")
|
||||||
|
libraryDependencies ++= Seq(
|
||||||
|
"com.novocode" % "junit-interface" % "0.11" % Test,
|
||||||
|
("org.apache.spark" %% "spark-core" % "3.0.0-X1").withDottyCompat(scalaVersion.value),
|
||||||
|
)
|
||||||
|
|
||||||
|
// Contains Spark 3 snapshot built against 2.13: https://github.com/smarter/spark/tree/scala-2.13
|
||||||
|
resolvers += Resolver.bintrayRepo("smarter", "maven")
|
||||||
|
|
||||||
|
testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-a", "-v", "-s")
|
||||||
|
|
||||||
|
testSuite := "stackoverflow.StackOverflowSuite"
|
||||||
|
|
||||||
|
// Without forking, ctrl-c doesn't actually fully stop Spark
|
||||||
|
fork in run := true
|
||||||
|
fork in Test := true
|
||||||
BIN
grading-tests.jar
Normal file
BIN
grading-tests.jar
Normal file
Binary file not shown.
46
project/MOOCSettings.scala
Normal file
46
project/MOOCSettings.scala
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
package ch.epfl.lamp
|
||||||
|
|
||||||
|
import sbt._
|
||||||
|
import sbt.Keys._
|
||||||
|
|
||||||
|
/**
  * Coursera uses two versions of each assignment. They both have the same assignment key and part id but have
  * different item ids.
  *
  * @param key Assignment key
  * @param partId Assignment partId
  * @param itemId Item id of the non premium version
  * @param premiumItemId Item id of the premium version (`None` if the assignment is optional)
  */
case class CourseraId(key: String, partId: String, itemId: String, premiumItemId: Option[String])

/**
  * Settings shared by all assignments, reused in various tasks.
  *
  * Auto-plugin: projects get these settings without explicitly enabling it.
  */
object MOOCSettings extends AutoPlugin {

  object autoImport {
    // Course/assignment identifiers, combined below into the project name.
    val course = SettingKey[String]("course")
    val assignment = SettingKey[String]("assignment")
    val options = SettingKey[Map[String, Map[String, String]]]("options")
    val courseraId = settingKey[CourseraId]("Coursera-specific information identifying the assignment")
    val testSuite = settingKey[String]("Fully qualified name of the test suite of this assignment")
    // Convenient alias so build files can refer to CourseraId unqualified.
    type CourseraId = ch.epfl.lamp.CourseraId
    val CourseraId = ch.epfl.lamp.CourseraId
  }

  import autoImport._

  override val globalSettings: Seq[Def.Setting[_]] = Seq(
    // supershell is verbose, buggy and useless.
    useSuperShell := false
  )

  override val projectSettings: Seq[Def.Setting[_]] = Seq(
    // Run tests sequentially — the Spark-based suites share a SparkContext.
    parallelExecution in Test := false,
    // Report test result after each test instead of waiting for every test to finish
    logBuffered in Test := false,
    // Project name, e.g. "bigdata-stackoverflow".
    name := s"${course.value}-${assignment.value}"
  )
}
|
||||||
318
project/StudentTasks.scala
Normal file
318
project/StudentTasks.scala
Normal file
@ -0,0 +1,318 @@
|
|||||||
|
package ch.epfl.lamp
|
||||||
|
|
||||||
|
import sbt._
|
||||||
|
import Keys._
|
||||||
|
|
||||||
|
// import scalaj.http._
|
||||||
|
import java.io.{File, FileInputStream, IOException}
|
||||||
|
import org.apache.commons.codec.binary.Base64
|
||||||
|
// import play.api.libs.json.{Json, JsObject, JsPath}
|
||||||
|
import scala.util.{Failure, Success, Try}
|
||||||
|
|
||||||
|
/**
  * Provides tasks for submitting the assignment
  * (packaging tasks plus the black-box grading-test runner).
  */
object StudentTasks extends AutoPlugin {

  // Depend on MOOCSettings so its course/assignment/testSuite keys exist.
  override def requires = super.requires && MOOCSettings

  object autoImport {
    val packageSourcesOnly = TaskKey[File]("packageSourcesOnly", "Package the sources of the project")
    val packageBinWithoutResources = TaskKey[File]("packageBinWithoutResources", "Like packageBin, but without the resources")

    // Zip combining the two artifacts above (sources.zip + binaries.jar).
    val packageSubmissionZip = TaskKey[File]("packageSubmissionZip")

    val packageSubmission = inputKey[Unit]("package solution as an archive file")

    val runGradingTests = taskKey[Unit]("run black-box tests used for final grading")
  }

  import autoImport._
  import MOOCSettings.autoImport._

  override lazy val projectSettings = Seq(
    packageSubmissionSetting,
    // submitSetting,
    runGradingTestsSettings,

    // Fork `run`/tests and wire stdin/stdout through, so interactive
    // programs and ctrl-c behave as expected.
    fork := true,
    connectInput in run := true,
    outputStrategy := Some(StdoutOutput),
  ) ++ packageSubmissionZipSettings
|
||||||
|
|
||||||
|
  /** Runs the pre-built black-box grading suite against the student's code. */
  lazy val runGradingTestsSettings = runGradingTests := {
    // The grading tests ship as a jar checked into the project root.
    val testSuiteJar = "grading-tests.jar"
    if (!new File(testSuiteJar).exists) {
      throw new MessageOnlyException(s"Could not find tests JarFile: $testSuiteJar")
    }

    // Test classpath of the project, with the grading jar appended.
    val classPath = s"${(Test / dependencyClasspath).value.map(_.data).mkString(File.pathSeparator)}${File.pathSeparator}$testSuiteJar"
    // Run JUnit's console runner on the configured suite in a forked JVM.
    val junitProcess =
      Fork.java.fork(
        ForkOptions(),
        "-cp" :: classPath ::
        "org.junit.runner.JUnitCore" ::
        (Test / testSuite).value ::
        Nil
      )

    // Wait for tests to complete.
    junitProcess.exitValue()
  }
|
||||||
|
|
||||||
|
|
||||||
|
  /** **********************************************************
    * SUBMITTING A SOLUTION TO COURSERA
    */

  val packageSubmissionZipSettings = Seq(
    packageSubmissionZip := {
      // submission.zip bundles a sources archive and a resource-free jar.
      val submission = crossTarget.value / "submission.zip"
      val sources = (packageSourcesOnly in Compile).value
      val binaries = (packageBinWithoutResources in Compile).value
      IO.zip(Seq(sources -> "sources.zip", binaries -> "binaries.jar"), submission)
      submission
    },
    artifactClassifier in packageSourcesOnly := Some("sources"),
    // Distinct artifact name so it can't clash with the regular packageBin jar.
    artifact in (Compile, packageBinWithoutResources) ~= (art => art.withName(art.name + "-without-resources"))
  ) ++
  inConfig(Compile)(
    Defaults.packageTaskSettings(packageSourcesOnly, Defaults.sourceMappings) ++
    Defaults.packageTaskSettings(packageBinWithoutResources, Def.task {
      // Drop every mapping whose path lives under an unmanaged resource directory.
      val relativePaths =
        (unmanagedResources in Compile).value.flatMap(Path.relativeTo((unmanagedResourceDirectories in Compile).value)(_))
      (mappings in (Compile, packageBin)).value.filterNot { case (_, path) => relativePaths.contains(path) }
    })
  )
|
||||||
|
|
||||||
|
val maxSubmitFileSize = {
|
||||||
|
val mb = 1024 * 1024
|
||||||
|
10 * mb
|
||||||
|
}
|
||||||
|
|
||||||
|
  /** Check that the jar exists, isn't empty, isn't crazy big, and can be read
    * If so, encode jar as base64 so we can send it to Coursera
    *
    * Aborts the whole task via failSubmit() (which throws) on any failure.
    */
  def prepareJar(jar: File, s: TaskStreams): String = {
    val errPrefix = "Error submitting assignment jar: "
    val fileLength = jar.length()
    if (!jar.exists()) {
      s.log.error(errPrefix + "jar archive does not exist\n" + jar.getAbsolutePath)
      failSubmit()
    } else if (fileLength == 0L) {
      s.log.error(errPrefix + "jar archive is empty\n" + jar.getAbsolutePath)
      failSubmit()
    } else if (fileLength > maxSubmitFileSize) {
      s.log.error(errPrefix + "jar archive is too big. Allowed size: " +
        maxSubmitFileSize + " bytes, found " + fileLength + " bytes.\n" +
        jar.getAbsolutePath)
      failSubmit()
    } else {
      // Read the whole jar into memory; bounded by maxSubmitFileSize above.
      val bytes = new Array[Byte](fileLength.toInt)
      val sizeRead = try {
        val is = new FileInputStream(jar)
        val read = is.read(bytes)
        is.close()
        read
      } catch {
        case ex: IOException =>
          s.log.error(errPrefix + "failed to read sources jar archive\n" + ex.toString)
          failSubmit()
      }
      // A single read() may return fewer bytes than requested; treated as failure.
      if (sizeRead != bytes.length) {
        s.log.error(errPrefix + "failed to read the sources jar archive, size read: " + sizeRead)
        failSubmit()
      } else encodeBase64(bytes)
    }
  }
|
||||||
|
|
||||||
|
  /** Task to package solution to a given file path */
  lazy val packageSubmissionSetting = packageSubmission := {
    // Optional single argument: the output path for the encoded submission.
    val args: Seq[String] = Def.spaceDelimited("[path]").parsed
    val s: TaskStreams = streams.value // for logging
    val jar = (packageSubmissionZip in Compile).value

    val base64Jar = prepareJar(jar, s)

    // Default output: <project root>/submission.jar (matches .gitlab-ci.yml artifact).
    val path = args.headOption.getOrElse((baseDirectory.value / "submission.jar").absolutePath)
    // NOTE(review): scala.tools.nsc.io.File is compiler-internal API;
    // java.nio.file.Files.write would be the supported route — confirm before changing.
    scala.tools.nsc.io.File(path).writeAll(base64Jar)
  }
|
||||||
|
|
||||||
|
/*
|
||||||
|
/** Task to submit a solution to coursera */
|
||||||
|
val submit = inputKey[Unit]("submit solution to Coursera")
|
||||||
|
lazy val submitSetting = submit := {
|
||||||
|
// Fail if scalafix linting does not pass.
|
||||||
|
scalafixLinting.value
|
||||||
|
|
||||||
|
val args: Seq[String] = Def.spaceDelimited("<arg>").parsed
|
||||||
|
val s: TaskStreams = streams.value // for logging
|
||||||
|
val jar = (packageSubmissionZip in Compile).value
|
||||||
|
|
||||||
|
val assignmentDetails =
|
||||||
|
courseraId.?.value.getOrElse(throw new MessageOnlyException("This assignment can not be submitted to Coursera because the `courseraId` setting is undefined"))
|
||||||
|
val assignmentKey = assignmentDetails.key
|
||||||
|
val courseName =
|
||||||
|
course.value match {
|
||||||
|
case "capstone" => "scala-capstone"
|
||||||
|
case "bigdata" => "scala-spark-big-data"
|
||||||
|
case other => other
|
||||||
|
}
|
||||||
|
|
||||||
|
val partId = assignmentDetails.partId
|
||||||
|
val itemId = assignmentDetails.itemId
|
||||||
|
val premiumItemId = assignmentDetails.premiumItemId
|
||||||
|
|
||||||
|
val (email, secret) = args match {
|
||||||
|
case email :: secret :: Nil =>
|
||||||
|
(email, secret)
|
||||||
|
case _ =>
|
||||||
|
val inputErr =
|
||||||
|
s"""|Invalid input to `submit`. The required syntax for `submit` is:
|
||||||
|
|submit <email-address> <submit-token>
|
||||||
|
|
|
||||||
|
|The submit token is NOT YOUR LOGIN PASSWORD.
|
||||||
|
|It can be obtained from the assignment page:
|
||||||
|
|https://www.coursera.org/learn/$courseName/programming/$itemId
|
||||||
|
|${
|
||||||
|
premiumItemId.fold("") { id =>
|
||||||
|
s"""or (for premium learners):
|
||||||
|
|https://www.coursera.org/learn/$courseName/programming/$id
|
||||||
|
""".stripMargin
|
||||||
|
}
|
||||||
|
}
|
||||||
|
""".stripMargin
|
||||||
|
s.log.error(inputErr)
|
||||||
|
failSubmit()
|
||||||
|
}
|
||||||
|
|
||||||
|
val base64Jar = prepareJar(jar, s)
|
||||||
|
val json =
|
||||||
|
s"""|{
|
||||||
|
| "assignmentKey":"$assignmentKey",
|
||||||
|
| "submitterEmail":"$email",
|
||||||
|
| "secret":"$secret",
|
||||||
|
| "parts":{
|
||||||
|
| "$partId":{
|
||||||
|
| "output":"$base64Jar"
|
||||||
|
| }
|
||||||
|
| }
|
||||||
|
|}""".stripMargin
|
||||||
|
|
||||||
|
def postSubmission[T](data: String): Try[HttpResponse[String]] = {
|
||||||
|
val http = Http("https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1")
|
||||||
|
val hs = List(
|
||||||
|
("Cache-Control", "no-cache"),
|
||||||
|
("Content-Type", "application/json")
|
||||||
|
)
|
||||||
|
s.log.info("Connecting to Coursera...")
|
||||||
|
val response = Try(http.postData(data)
|
||||||
|
.headers(hs)
|
||||||
|
.option(HttpOptions.connTimeout(10000)) // scalaj default timeout is only 100ms, changing that to 10s
|
||||||
|
.asString) // kick off HTTP POST
|
||||||
|
response
|
||||||
|
}
|
||||||
|
|
||||||
|
val connectMsg =
|
||||||
|
s"""|Attempting to submit "${assignment.value}" assignment in "$courseName" course
|
||||||
|
|Using:
|
||||||
|
|- email: $email
|
||||||
|
|- submit token: $secret""".stripMargin
|
||||||
|
s.log.info(connectMsg)
|
||||||
|
|
||||||
|
def reportCourseraResponse(response: HttpResponse[String]): Unit = {
|
||||||
|
val code = response.code
|
||||||
|
val respBody = response.body
|
||||||
|
|
||||||
|
/* Sample JSON response from Coursera
|
||||||
|
{
|
||||||
|
"message": "Invalid email or token.",
|
||||||
|
"details": {
|
||||||
|
"learnerMessage": "Invalid email or token."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Success, Coursera responds with 2xx HTTP status code
|
||||||
|
if (response.is2xx) {
|
||||||
|
val successfulSubmitMsg =
|
||||||
|
s"""|Successfully connected to Coursera. (Status $code)
|
||||||
|
|
|
||||||
|
|Assignment submitted successfully!
|
||||||
|
|
|
||||||
|
|You can see how you scored by going to:
|
||||||
|
|https://www.coursera.org/learn/$courseName/programming/$itemId/
|
||||||
|
|${
|
||||||
|
premiumItemId.fold("") { id =>
|
||||||
|
s"""or (for premium learners):
|
||||||
|
|https://www.coursera.org/learn/$courseName/programming/$id
|
||||||
|
""".stripMargin
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|and clicking on "My Submission".""".stripMargin
|
||||||
|
s.log.info(successfulSubmitMsg)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Failure, Coursera responds with 4xx HTTP status code (client-side failure)
|
||||||
|
else if (response.is4xx) {
|
||||||
|
val result = Try(Json.parse(respBody)).toOption
|
||||||
|
val learnerMsg = result match {
|
||||||
|
case Some(resp: JsObject) =>
|
||||||
|
(JsPath \ "details" \ "learnerMessage").read[String].reads(resp).get
|
||||||
|
case Some(x) => // shouldn't happen
|
||||||
|
"Could not parse Coursera's response:\n" + x
|
||||||
|
case None =>
|
||||||
|
"Could not parse Coursera's response:\n" + respBody
|
||||||
|
}
|
||||||
|
val failedSubmitMsg =
|
||||||
|
s"""|Submission failed.
|
||||||
|
|There was something wrong while attempting to submit.
|
||||||
|
|Coursera says:
|
||||||
|
|$learnerMsg (Status $code)""".stripMargin
|
||||||
|
s.log.error(failedSubmitMsg)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Failure, Coursera responds with 5xx HTTP status code (server-side failure)
|
||||||
|
else if (response.is5xx) {
|
||||||
|
val failedSubmitMsg =
|
||||||
|
s"""|Submission failed.
|
||||||
|
|Coursera seems to be unavailable at the moment (Status $code)
|
||||||
|
|Check https://status.coursera.org/ and try again in a few minutes.
|
||||||
|
""".stripMargin
|
||||||
|
s.log.error(failedSubmitMsg)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Failure, Coursera repsonds with an unexpected status code
|
||||||
|
else {
|
||||||
|
val failedSubmitMsg =
|
||||||
|
s"""|Submission failed.
|
||||||
|
|Coursera replied with an unexpected code (Status $code)
|
||||||
|
""".stripMargin
|
||||||
|
s.log.error(failedSubmitMsg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// kick it all off, actually make request
|
||||||
|
postSubmission(json) match {
|
||||||
|
case Success(resp) => reportCourseraResponse(resp)
|
||||||
|
case Failure(e) =>
|
||||||
|
val failedConnectMsg =
|
||||||
|
s"""|Connection to Coursera failed.
|
||||||
|
|There was something wrong while attempting to connect to Coursera.
|
||||||
|
|Check your internet connection.
|
||||||
|
|${e.toString}""".stripMargin
|
||||||
|
s.log.error(failedConnectMsg)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
def failSubmit(): Nothing = {
|
||||||
|
sys.error("Submission failed")
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* *****************
|
||||||
|
* DEALING WITH JARS
|
||||||
|
*/
|
||||||
|
def encodeBase64(bytes: Array[Byte]): String =
|
||||||
|
new String(Base64.encodeBase64(bytes))
|
||||||
|
}
|
||||||
1
project/build.properties
Normal file
1
project/build.properties
Normal file
@ -0,0 +1 @@
|
|||||||
|
sbt.version=1.3.8
|
||||||
5
project/buildSettings.sbt
Normal file
5
project/buildSettings.sbt
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
// Used for Coursera submission (StudentPlugin)
|
||||||
|
// libraryDependencies += "org.scalaj" %% "scalaj-http" % "2.4.2"
|
||||||
|
// libraryDependencies += "com.typesafe.play" %% "play-json" % "2.7.4"
|
||||||
|
// Used for Base64 (StudentPlugin)
|
||||||
|
libraryDependencies += "commons-codec" % "commons-codec" % "1.10"
|
||||||
2
project/plugins.sbt
Normal file
2
project/plugins.sbt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
addSbtPlugin("org.scala-js" % "sbt-scalajs" % "0.6.28")
|
||||||
|
addSbtPlugin("ch.epfl.lamp" % "sbt-dotty" % "0.4.0")
|
||||||
0
src/main/resources/stackoverflow/.keep
Normal file
0
src/main/resources/stackoverflow/.keep
Normal file
308
src/main/scala/stackoverflow/StackOverflow.scala
Normal file
308
src/main/scala/stackoverflow/StackOverflow.scala
Normal file
@ -0,0 +1,308 @@
|
|||||||
|
package stackoverflow
|
||||||
|
|
||||||
|
import org.apache.spark.SparkConf
|
||||||
|
import org.apache.spark.SparkContext
|
||||||
|
import org.apache.spark.SparkContext._
|
||||||
|
import org.apache.spark.rdd.RDD
|
||||||
|
import org.apache.log4j.{Logger, Level}
|
||||||
|
|
||||||
|
import annotation.tailrec
|
||||||
|
import scala.reflect.ClassTag
|
||||||
|
import scala.util.Properties.isWin
|
||||||
|
|
||||||
|
// Semantic aliases over the raw CSV columns.
type Question = Posting   // postingType == 1 — assumes the standard SO dump encoding; TODO confirm
type Answer = Posting     // postingType == 2 — assumes the standard SO dump encoding; TODO confirm
type QID = Int            // question id
type HighScore = Int      // highest answer score of a question
type LangIndex = Int      // language coordinate for kmeans (presumably index scaled by langSpread — set in the vectorPostings stub)

/** A raw stackoverflow posting, either a question or an answer */
case class Posting(postingType: Int, id: Int, acceptedAnswer: Option[Int], parentId: Option[QID], score: Int, tags: Option[String]) extends Serializable
|
||||||
|
|
||||||
|
/** The main class */
object StackOverflow extends StackOverflow {

  // Reduce Spark logging verbosity
  Logger.getLogger("org").setLevel(Level.ERROR)

  // Hadoop on Windows needs winutils.exe; point it at the bundled copy.
  if (isWin) System.setProperty("hadoop.home.dir", System.getProperty("user.dir") + "\\winutils\\hadoop-2.7.4")

  // lazy: only start Spark when first used; @transient: keep the context
  // out of closures that Spark serializes to executors.
  @transient lazy val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("StackOverflow")
  @transient lazy val sc: SparkContext = new SparkContext(conf)

  /** Main function: parse -> group -> score -> vectorize -> kmeans -> report. */
  def main(args: Array[String]): Unit = {

    val lines = sc.textFile("src/main/resources/stackoverflow/stackoverflow-grading.csv")
    val raw = rawPostings(lines)
    val grouped = groupedPostings(raw)
    val scored = scoredPostings(grouped)
    val vectors = vectorPostings(scored)
    // assert(vectors.count() == 2121822, "Incorrect number of vectors: " + vectors.count())

    val means = kmeans(sampleVectors(vectors), vectors, debug = true)
    val results = clusterResults(means, vectors)
    printResults(results)
  }
}
|
||||||
|
|
||||||
|
/** The parsing and kmeans methods */
class StackOverflow extends Serializable {

  /** Languages */
  val langs =
    List(
      "JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS",
      "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy")

  // The parameters below are defs (not vals) so the test suite can override them.

  /** K-means parameter: How "far apart" languages should be for the kmeans algorithm? */
  def langSpread = 50000
  assert(langSpread > 0, "If langSpread is zero we can't recover the language from the input data!")

  /** K-means parameter: Number of clusters */
  def kmeansKernels = 45

  /** K-means parameter: Convergence criteria */
  def kmeansEta: Double = 20.0D

  /** K-means parameter: Maximum iterations */
  def kmeansMaxIterations = 120
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// Parsing utilities:
|
||||||
|
//
|
||||||
|
//
|
||||||
|
|
||||||
|
  /** Load postings from the given file.
    *
    * Each CSV line is: postingType,id,acceptedAnswer,parentId,score[,tag]
    * Empty fields become None; the tag column may be missing entirely.
    */
  def rawPostings(lines: RDD[String]): RDD[Posting] =
    lines.map(line => {
      val arr = line.split(",")
      Posting(postingType = arr(0).toInt,
              id = arr(1).toInt,
              acceptedAnswer = if (arr(2) == "") None else Some(arr(2).toInt),
              parentId = if (arr(3) == "") None else Some(arr(3).toInt),
              score = arr(4).toInt,
              // intern(): few distinct tags across millions of rows, so share the strings
              tags = if (arr.length >= 6) Some(arr(5).intern()) else None)
    })
|
||||||
|
|
||||||
|
|
||||||
|
  /** Group the questions and answers together.
    *
    * Assignment stub: pair each answer with its question (via parentId /
    * question id) and group the pairs by question id.
    */
  def groupedPostings(postings: RDD[Posting]): RDD[(QID, Iterable[(Question, Answer)])] = {
    ???
  }
|
||||||
|
|
||||||
|
|
||||||
|
  /** Compute the maximum score for each posting.
    *
    * Assignment stub: for each grouped question, emit the question paired
    * with its highest answer score (see answerHighScore below).
    */
  def scoredPostings(grouped: RDD[(QID, Iterable[(Question, Answer)])]): RDD[(Question, HighScore)] = {

    // Imperative max over the answers' scores; returns 0 for an empty array.
    def answerHighScore(as: Array[Answer]): HighScore = {
      var highScore = 0
      var i = 0
      while (i < as.length) {
        val score = as(i).score
        if (score > highScore)
          highScore = score
        i += 1
      }
      highScore
    }

    ???
  }
|
||||||
|
|
||||||
|
|
||||||
|
  /** Compute the vectors for the kmeans.
    *
    * Assignment stub: map each (question, highScore) to a
    * (LangIndex, HighScore) point — presumably the language's index scaled
    * by langSpread; confirm against the assignment handout.
    */
  def vectorPostings(scored: RDD[(Question, HighScore)]): RDD[(LangIndex, HighScore)] = {
    /** Return optional index of first language that occurs in `tags`. */
    def firstLangInTag(tag: Option[String], ls: List[String]): Option[Int] = {
      if (tag.isEmpty) None
      else if (ls.isEmpty) None
      else if (tag.get == ls.head) Some(0) // index: 0
      else {
        // Recurse on the tail and shift any found index by one.
        val tmp = firstLangInTag(tag, ls.tail)
        tmp match {
          case None => None
          case Some(i) => Some(i + 1) // index i in ls.tail => index i+1
        }
      }
    }

    ???
  }
|
||||||
|
|
||||||
|
|
||||||
|
  /** Sample the vectors: choose the kmeansKernels initial means,
    * `perLang` of them per language so every language is represented.
    */
  def sampleVectors(vectors: RDD[(LangIndex, HighScore)]): Array[(Int, Int)] = {

    assert(kmeansKernels % langs.length == 0, "kmeansKernels should be a multiple of the number of languages studied.")
    val perLang = kmeansKernels / langs.length

    // http://en.wikipedia.org/wiki/Reservoir_sampling
    // Uniform fixed-size sample from an iterator of unknown length.
    // Seeding the RNG with `lang` makes the sample deterministic per language.
    def reservoirSampling(lang: Int, iter: Iterator[Int], size: Int): Array[Int] = {
      val res = new Array[Int](size)
      val rnd = new util.Random(lang)

      // Fill the reservoir with the first `size` elements (must exist).
      for (i <- 0 until size) {
        assert(iter.hasNext, s"iterator must have at least $size elements")
        res(i) = iter.next
      }

      // Element i (1-based beyond `size`) replaces a random slot with probability size/i.
      var i = size.toLong
      while (iter.hasNext) {
        val elt = iter.next
        val j = math.abs(rnd.nextLong) % i
        if (j < size)
          res(j.toInt) = elt
        i += 1
      }

      res
    }

    val res =
      if (langSpread < 500)
        // sample the space regardless of the language
        vectors.takeSample(false, kmeansKernels, 42)
      else
        // sample the space uniformly from each language partition
        vectors.groupByKey.flatMap({
          case (lang, vectors) => reservoirSampling(lang, vectors.iterator, perLang).map((lang, _))
        }).collect()

    assert(res.length == kmeansKernels, res.length)
    res
  }
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// Kmeans method:
|
||||||
|
//
|
||||||
|
//
|
||||||
|
|
||||||
|
  /** Main kmeans computation.
    *
    * Tail-recursively iterates until the total movement of the means drops
    * below kmeansEta, or kmeansMaxIterations is reached. NOTE: newMeans is
    * currently just a clone of means — computing the updated means is the
    * assignment TODO below, so as shipped this converges immediately.
    */
  @tailrec final def kmeans(means: Array[(Int, Int)], vectors: RDD[(Int, Int)], iter: Int = 1, debug: Boolean = false): Array[(Int, Int)] = {
    val newMeans = means.clone() // you need to compute newMeans

    // TODO: Fill in the newMeans array
    val distance = euclideanDistance(means, newMeans)

    if (debug) {
      println(s"""Iteration: $iter
                 | * current distance: $distance
                 | * desired distance: $kmeansEta
                 | * means:""".stripMargin)
      for (idx <- 0 until kmeansKernels)
        println(f" ${means(idx).toString}%20s ==> ${newMeans(idx).toString}%20s " +
                f" distance: ${euclideanDistance(means(idx), newMeans(idx))}%8.0f")
    }

    if (converged(distance))
      newMeans
    else if (iter < kmeansMaxIterations)
      kmeans(newMeans, vectors, iter + 1, debug)
    else {
      if (debug) {
        println("Reached max iterations!")
      }
      newMeans
    }
  }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// Kmeans utilities:
|
||||||
|
//
|
||||||
|
//
|
||||||
|
|
||||||
|
/** Decide whether the kmeans clustering converged */
|
||||||
|
def converged(distance: Double) =
|
||||||
|
distance < kmeansEta
|
||||||
|
|
||||||
|
|
||||||
|
/** Return the euclidean distance between two points */
|
||||||
|
def euclideanDistance(v1: (Int, Int), v2: (Int, Int)): Double = {
|
||||||
|
val part1 = (v1._1 - v2._1).toDouble * (v1._1 - v2._1)
|
||||||
|
val part2 = (v1._2 - v2._2).toDouble * (v1._2 - v2._2)
|
||||||
|
part1 + part2
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Return the euclidean distance between two points */
|
||||||
|
def euclideanDistance(a1: Array[(Int, Int)], a2: Array[(Int, Int)]): Double = {
|
||||||
|
assert(a1.length == a2.length)
|
||||||
|
var sum = 0d
|
||||||
|
var idx = 0
|
||||||
|
while(idx < a1.length) {
|
||||||
|
sum += euclideanDistance(a1(idx), a2(idx))
|
||||||
|
idx += 1
|
||||||
|
}
|
||||||
|
sum
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Return the closest point */
|
||||||
|
def findClosest(p: (Int, Int), centers: Array[(Int, Int)]): Int = {
|
||||||
|
var bestIndex = 0
|
||||||
|
var closest = Double.PositiveInfinity
|
||||||
|
for (i <- 0 until centers.length) {
|
||||||
|
val tempDist = euclideanDistance(p, centers(i))
|
||||||
|
if (tempDist < closest) {
|
||||||
|
closest = tempDist
|
||||||
|
bestIndex = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bestIndex
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Average the vectors */
|
||||||
|
def averageVectors(ps: Iterable[(Int, Int)]): (Int, Int) = {
|
||||||
|
val iter = ps.iterator
|
||||||
|
var count = 0
|
||||||
|
var comp1: Long = 0
|
||||||
|
var comp2: Long = 0
|
||||||
|
while (iter.hasNext) {
|
||||||
|
val item = iter.next
|
||||||
|
comp1 += item._1
|
||||||
|
comp2 += item._2
|
||||||
|
count += 1
|
||||||
|
}
|
||||||
|
((comp1 / count).toInt, (comp2 / count).toInt)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// Displaying results:
|
||||||
|
//
|
||||||
|
//
|
||||||
|
  /** Summarize each cluster as (dominant language, its percentage, size, median score). */
  def clusterResults(means: Array[(Int, Int)], vectors: RDD[(LangIndex, HighScore)]): Array[(String, Double, Int, Int)] = {
    // Assign every vector to its nearest mean, then gather per cluster.
    val closest = vectors.map(p => (findClosest(p, means), p))
    val closestGrouped = closest.groupByKey()

    // Per-cluster statistics (assignment stubs below).
    val median = closestGrouped.mapValues { vs =>
      val langLabel: String = ??? // most common language in the cluster
      val langPercent: Double = ??? // percent of the questions in the most common language
      val clusterSize: Int = ???
      val medianScore: Int = ???

      (langLabel, langPercent, clusterSize, medianScore)
    }

    // Drop the cluster ids and sort ascending by median score.
    median.collect().map(_._2).sortBy(_._4)
  }
|
||||||
|
|
||||||
|
def printResults(results: Array[(String, Double, Int, Int)]): Unit = {
|
||||||
|
println("Resulting clusters:")
|
||||||
|
println(" Score Dominant language (%percent) Questions")
|
||||||
|
println("================================================")
|
||||||
|
for ((lang, percent, size, score) <- results)
|
||||||
|
println(f"${score}%7d ${lang}%-17s (${percent}%-5.1f%%) ${size}%7d")
|
||||||
|
}
|
||||||
|
}
|
||||||
47
src/test/scala/stackoverflow/StackOverflowSuite.scala
Normal file
47
src/test/scala/stackoverflow/StackOverflowSuite.scala
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
package stackoverflow
|
||||||
|
|
||||||
|
import org.apache.spark.SparkConf
|
||||||
|
import org.apache.spark.SparkContext
|
||||||
|
import org.apache.spark.SparkContext._
|
||||||
|
import org.apache.spark.rdd.RDD
|
||||||
|
import org.junit._
|
||||||
|
import org.junit.Assert.assertEquals
|
||||||
|
import java.io.File
|
||||||
|
import scala.io.{ Codec, Source }
|
||||||
|
import scala.util.Properties.isWin
|
||||||
|
|
||||||
|
object StackOverflowSuite {
  // Hadoop on Windows needs winutils.exe; point it at the bundled copy.
  if (isWin) System.setProperty("hadoop.home.dir", System.getProperty("user.dir") + "\\winutils\\hadoop-2.7.4")

  // One shared local SparkContext for the whole suite (tests run sequentially).
  val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("StackOverflow")
  val sc: SparkContext = new SparkContext(conf)
}
|
||||||
|
|
||||||
|
class StackOverflowSuite {
  import StackOverflowSuite._


  // A StackOverflow instance with every tunable pinned to the expected
  // defaults, so tests are insulated from edits to the class's parameters.
  lazy val testObject = new StackOverflow {
    override val langs =
      List(
        "JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS",
        "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy")
    override def langSpread = 50000
    override def kmeansKernels = 45
    override def kmeansEta: Double = 20.0D
    override def kmeansMaxIterations = 120
  }

  // Smoke test: construction alone must not throw.
  @Test def `testObject can be instantiated`: Unit = {
    val instantiatable = try {
      testObject
      true
    } catch {
      case _: Throwable => false
    }
    assert(instantiatable, "Can't instantiate a StackOverflow object")
  }


  // Kill any individual test after 300 seconds.
  @Rule def individualTestTimeout = new org.junit.rules.Timeout(300 * 1000)
}
|
||||||
BIN
winutils/hadoop-2.7.4/bin/winutils.exe
Normal file
BIN
winutils/hadoop-2.7.4/bin/winutils.exe
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user