commit e408ad50aec693fddf9d462f00390a93241017d5 Author: Guillaume Martres Date: Tue Feb 19 20:44:23 2019 +0100 Add wikipedia assignment diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..996b5d0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +# General +*.DS_Store +*.swp +*~ + +# Dotty +*.class +*.tasty +*.hasTasty + +# sbt +target/ + +# Dotty IDE +/.dotty-ide-artifact +/.dotty-ide.json + +# datasets +stackoverflow-grading.csv +wikipedia-grading.dat diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..e5ec826 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,36 @@ +# DO NOT EDIT THIS FILE + +stages: + - build + - grade + +compile: + stage: build + image: lampepfl/moocs:dotty-2020-02-12 + except: + - tags + tags: + - cs206 + script: + - sbt packageSubmission + artifacts: + expire_in: 1 day + paths: + - submission.jar + +grade: + stage: grade + except: + - tags + tags: + - cs206 + image: + name: smarter3/moocs:bigdata-wikipedia-2020-04-30-3 + entrypoint: [""] + allow_failure: true + before_script: + - mkdir -p /shared/submission/ + - cp submission.jar /shared/submission/submission.jar + script: + - cd /grader + - /grader/grade | /grader/feedback-printer diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..a35362b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,8 @@ +{ + "dotty": { + "trace": { + "remoteTracingUrl": "wss://lamppc36.epfl.ch/dotty-remote-tracer/upload/lsp.log", + "server": { "format": "JSON", "verbosity": "verbose" } + } + } +} diff --git a/assignment.sbt b/assignment.sbt new file mode 100644 index 0000000..4dd1761 --- /dev/null +++ b/assignment.sbt @@ -0,0 +1,9 @@ +// Student tasks (i.e. submit, packageSubmission) +enablePlugins(StudentTasks) + +courseraId := ch.epfl.lamp.CourseraId( + key = "EH8wby4kEeawURILfHIqjw", + itemId = "CfQX2", + premiumItemId = Some("QcWcs"), + partId = "5komc" +) diff --git a/build.sbt b/build.sbt new file mode 100644 index 0000000..d35eb5a --- /dev/null +++ b/build.sbt @@ -0,0 +1,20 @@ +course := "bigdata" +assignment := "wikipedia" + +scalaVersion := "0.24.0-RC1" +scalacOptions ++= Seq("-language:implicitConversions", "-deprecation") +libraryDependencies ++= Seq( + "com.novocode" % "junit-interface" % "0.11" % Test, + ("org.apache.spark" %% "spark-core" % "3.0.0-X1").withDottyCompat(scalaVersion.value), +) + +// Contains Spark 3 snapshot built against 2.13: https://github.com/smarter/spark/tree/scala-2.13 +resolvers += Resolver.bintrayRepo("smarter", "maven") + +testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-a", "-v", "-s") + +testSuite := "wikipedia.WikipediaSuite" + +// Without forking, ctrl-c doesn't actually fully stop Spark +fork in run := true +fork in Test := true diff --git a/grading-tests.jar b/grading-tests.jar new file mode 100644 index 0000000..0ab663e Binary files /dev/null and b/grading-tests.jar differ diff --git a/project/MOOCSettings.scala b/project/MOOCSettings.scala new file mode 100644 index 0000000..171244f --- /dev/null +++ b/project/MOOCSettings.scala @@ -0,0 +1,46 @@ +package ch.epfl.lamp + +import sbt._ +import sbt.Keys._ + +/** + * Coursera uses two versions of each assignment. They both have the same assignment key and part id but have + * different item ids. + * + * @param key Assignment key + * @param partId Assignment partId + * @param itemId Item id of the non premium version + * @param premiumItemId Item id of the premium version (`None` if the assignment is optional) + */ +case class CourseraId(key: String, partId: String, itemId: String, premiumItemId: Option[String]) + +/** + * Settings shared by all assignments, reused in various tasks. + */ +object MOOCSettings extends AutoPlugin { + + object autoImport { + val course = SettingKey[String]("course") + val assignment = SettingKey[String]("assignment") + val options = SettingKey[Map[String, Map[String, String]]]("options") + val courseraId = settingKey[CourseraId]("Coursera-specific information identifying the assignment") + val testSuite = settingKey[String]("Fully qualified name of the test suite of this assignment") + // Convenient alias + type CourseraId = ch.epfl.lamp.CourseraId + val CourseraId = ch.epfl.lamp.CourseraId + } + + import autoImport._ + + override val globalSettings: Seq[Def.Setting[_]] = Seq( + // supershell is verbose, buggy and useless. + useSuperShell := false + ) + + override val projectSettings: Seq[Def.Setting[_]] = Seq( + parallelExecution in Test := false, + // Report test result after each test instead of waiting for every test to finish + logBuffered in Test := false, + name := s"${course.value}-${assignment.value}" + ) +} diff --git a/project/StudentTasks.scala b/project/StudentTasks.scala new file mode 100644 index 0000000..7604830 --- /dev/null +++ b/project/StudentTasks.scala @@ -0,0 +1,318 @@ +package ch.epfl.lamp + +import sbt._ +import Keys._ + +// import scalaj.http._ +import java.io.{File, FileInputStream, IOException} +import org.apache.commons.codec.binary.Base64 +// import play.api.libs.json.{Json, JsObject, JsPath} +import scala.util.{Failure, Success, Try} + +/** + * Provides tasks for submitting the assignment + */ +object StudentTasks extends AutoPlugin { + + override def requires = super.requires && MOOCSettings + + object autoImport { + val packageSourcesOnly = TaskKey[File]("packageSourcesOnly", "Package the sources of the project") + val packageBinWithoutResources = TaskKey[File]("packageBinWithoutResources", "Like packageBin, but without the resources") + val packageSubmissionZip = TaskKey[File]("packageSubmissionZip") + val packageSubmission = inputKey[Unit]("package solution as an archive file") + val runGradingTests = taskKey[Unit]("run black-box tests used for final grading") + } + + + import autoImport._ + import MOOCSettings.autoImport._ + + override lazy val projectSettings = Seq( + packageSubmissionSetting, + // submitSetting, + runGradingTestsSettings, + + fork := true, + connectInput in run := true, + outputStrategy := Some(StdoutOutput), + ) ++ packageSubmissionZipSettings + + lazy val runGradingTestsSettings = runGradingTests := { + val testSuiteJar = "grading-tests.jar" + if (!new File(testSuiteJar).exists) { + throw new MessageOnlyException(s"Could not find tests JarFile: $testSuiteJar") + } + + val classPath = s"${(Test / dependencyClasspath).value.map(_.data).mkString(File.pathSeparator)}${File.pathSeparator}$testSuiteJar" + val junitProcess = + Fork.java.fork( + ForkOptions(), + "-cp" :: classPath :: + "org.junit.runner.JUnitCore" :: + (Test / testSuite).value :: + Nil + ) + + // Wait for tests to complete. + junitProcess.exitValue() + } + + + /** ********************************************************** + * SUBMITTING A SOLUTION TO COURSERA + */ + + val packageSubmissionZipSettings = Seq( + packageSubmissionZip := { + val submission = crossTarget.value / "submission.zip" + val sources = (packageSourcesOnly in Compile).value + val binaries = (packageBinWithoutResources in Compile).value + IO.zip(Seq(sources -> "sources.zip", binaries -> "binaries.jar"), submission) + submission + }, + artifactClassifier in packageSourcesOnly := Some("sources"), + artifact in (Compile, packageBinWithoutResources) ~= (art => art.withName(art.name + "-without-resources")) + ) ++ + inConfig(Compile)( + Defaults.packageTaskSettings(packageSourcesOnly, Defaults.sourceMappings) ++ + Defaults.packageTaskSettings(packageBinWithoutResources, Def.task { + val relativePaths = + (unmanagedResources in Compile).value.flatMap(Path.relativeTo((unmanagedResourceDirectories in Compile).value)(_)) + (mappings in (Compile, packageBin)).value.filterNot { case (_, path) => relativePaths.contains(path) } + }) + ) + + val maxSubmitFileSize = { + val mb = 1024 * 1024 + 10 * mb + } + + /** Check that the jar exists, isn't empty, isn't crazy big, and can be read + * If so, encode jar as base64 so we can send it to Coursera + */ + def prepareJar(jar: File, s: TaskStreams): String = { + val errPrefix = "Error submitting assignment jar: " + val fileLength = jar.length() + if (!jar.exists()) { + s.log.error(errPrefix + "jar archive does not exist\n" + jar.getAbsolutePath) + failSubmit() + } else if (fileLength == 0L) { + s.log.error(errPrefix + "jar archive is empty\n" + jar.getAbsolutePath) + failSubmit() + } else if (fileLength > maxSubmitFileSize) { + s.log.error(errPrefix + "jar archive is too big. Allowed size: " + + maxSubmitFileSize + " bytes, found " + fileLength + " bytes.\n" + + jar.getAbsolutePath) + failSubmit() + } else { + val bytes = new Array[Byte](fileLength.toInt) + val sizeRead = try { + val is = new FileInputStream(jar) + val read = is.read(bytes) + is.close() + read + } catch { + case ex: IOException => + s.log.error(errPrefix + "failed to read sources jar archive\n" + ex.toString) + failSubmit() + } + if (sizeRead != bytes.length) { + s.log.error(errPrefix + "failed to read the sources jar archive, size read: " + sizeRead) + failSubmit() + } else encodeBase64(bytes) + } + } + + /** Task to package solution to a given file path */ + lazy val packageSubmissionSetting = packageSubmission := { + val args: Seq[String] = Def.spaceDelimited("[path]").parsed + val s: TaskStreams = streams.value // for logging + val jar = (packageSubmissionZip in Compile).value + + val base64Jar = prepareJar(jar, s) + + val path = args.headOption.getOrElse((baseDirectory.value / "submission.jar").absolutePath) + scala.tools.nsc.io.File(path).writeAll(base64Jar) + } + +/* + /** Task to submit a solution to coursera */ + val submit = inputKey[Unit]("submit solution to Coursera") + lazy val submitSetting = submit := { + // Fail if scalafix linting does not pass. + scalafixLinting.value + + val args: Seq[String] = Def.spaceDelimited("").parsed + val s: TaskStreams = streams.value // for logging + val jar = (packageSubmissionZip in Compile).value + + val assignmentDetails = + courseraId.?.value.getOrElse(throw new MessageOnlyException("This assignment can not be submitted to Coursera because the `courseraId` setting is undefined")) + val assignmentKey = assignmentDetails.key + val courseName = + course.value match { + case "capstone" => "scala-capstone" + case "bigdata" => "scala-spark-big-data" + case other => other + } + + val partId = assignmentDetails.partId + val itemId = assignmentDetails.itemId + val premiumItemId = assignmentDetails.premiumItemId + + val (email, secret) = args match { + case email :: secret :: Nil => + (email, secret) + case _ => + val inputErr = + s"""|Invalid input to `submit`. The required syntax for `submit` is: + |submit + | + |The submit token is NOT YOUR LOGIN PASSWORD. + |It can be obtained from the assignment page: + |https://www.coursera.org/learn/$courseName/programming/$itemId + |${ + premiumItemId.fold("") { id => + s"""or (for premium learners): + |https://www.coursera.org/learn/$courseName/programming/$id + """.stripMargin + } + } + """.stripMargin + s.log.error(inputErr) + failSubmit() + } + + val base64Jar = prepareJar(jar, s) + val json = + s"""|{ + | "assignmentKey":"$assignmentKey", + | "submitterEmail":"$email", + | "secret":"$secret", + | "parts":{ + | "$partId":{ + | "output":"$base64Jar" + | } + | } + |}""".stripMargin + + def postSubmission[T](data: String): Try[HttpResponse[String]] = { + val http = Http("https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1") + val hs = List( + ("Cache-Control", "no-cache"), + ("Content-Type", "application/json") + ) + s.log.info("Connecting to Coursera...") + val response = Try(http.postData(data) + .headers(hs) + .option(HttpOptions.connTimeout(10000)) // scalaj default timeout is only 100ms, changing that to 10s + .asString) // kick off HTTP POST + response + } + + val connectMsg = + s"""|Attempting to submit "${assignment.value}" assignment in "$courseName" course + |Using: + |- email: $email + |- submit token: $secret""".stripMargin + s.log.info(connectMsg) + + def reportCourseraResponse(response: HttpResponse[String]): Unit = { + val code = response.code + val respBody = response.body + + /* Sample JSON response from Coursera + { + "message": "Invalid email or token.", + "details": { + "learnerMessage": "Invalid email or token." + } + } + */ + + // Success, Coursera responds with 2xx HTTP status code + if (response.is2xx) { + val successfulSubmitMsg = + s"""|Successfully connected to Coursera. (Status $code) + | + |Assignment submitted successfully! + | + |You can see how you scored by going to: + |https://www.coursera.org/learn/$courseName/programming/$itemId/ + |${ + premiumItemId.fold("") { id => + s"""or (for premium learners): + |https://www.coursera.org/learn/$courseName/programming/$id + """.stripMargin + } + } + |and clicking on "My Submission".""".stripMargin + s.log.info(successfulSubmitMsg) + } + + // Failure, Coursera responds with 4xx HTTP status code (client-side failure) + else if (response.is4xx) { + val result = Try(Json.parse(respBody)).toOption + val learnerMsg = result match { + case Some(resp: JsObject) => + (JsPath \ "details" \ "learnerMessage").read[String].reads(resp).get + case Some(x) => // shouldn't happen + "Could not parse Coursera's response:\n" + x + case None => + "Could not parse Coursera's response:\n" + respBody + } + val failedSubmitMsg = + s"""|Submission failed. + |There was something wrong while attempting to submit. + |Coursera says: + |$learnerMsg (Status $code)""".stripMargin + s.log.error(failedSubmitMsg) + } + + // Failure, Coursera responds with 5xx HTTP status code (server-side failure) + else if (response.is5xx) { + val failedSubmitMsg = + s"""|Submission failed. + |Coursera seems to be unavailable at the moment (Status $code) + |Check https://status.coursera.org/ and try again in a few minutes. + """.stripMargin + s.log.error(failedSubmitMsg) + } + + // Failure, Coursera repsonds with an unexpected status code + else { + val failedSubmitMsg = + s"""|Submission failed. + |Coursera replied with an unexpected code (Status $code) + """.stripMargin + s.log.error(failedSubmitMsg) + } + } + + // kick it all off, actually make request + postSubmission(json) match { + case Success(resp) => reportCourseraResponse(resp) + case Failure(e) => + val failedConnectMsg = + s"""|Connection to Coursera failed. + |There was something wrong while attempting to connect to Coursera. + |Check your internet connection. + |${e.toString}""".stripMargin + s.log.error(failedConnectMsg) + } + + } +*/ + + def failSubmit(): Nothing = { + sys.error("Submission failed") + } + + /** + * ***************** + * DEALING WITH JARS + */ + def encodeBase64(bytes: Array[Byte]): String = + new String(Base64.encodeBase64(bytes)) +} diff --git a/project/build.properties b/project/build.properties new file mode 100644 index 0000000..a919a9b --- /dev/null +++ b/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.3.8 diff --git a/project/buildSettings.sbt b/project/buildSettings.sbt new file mode 100644 index 0000000..8fac702 --- /dev/null +++ b/project/buildSettings.sbt @@ -0,0 +1,5 @@ +// Used for Coursera submission (StudentPlugin) +// libraryDependencies += "org.scalaj" %% "scalaj-http" % "2.4.2" +// libraryDependencies += "com.typesafe.play" %% "play-json" % "2.7.4" +// Used for Base64 (StudentPlugin) +libraryDependencies += "commons-codec" % "commons-codec" % "1.10" diff --git a/project/plugins.sbt b/project/plugins.sbt new file mode 100644 index 0000000..017735d --- /dev/null +++ b/project/plugins.sbt @@ -0,0 +1,2 @@ +addSbtPlugin("org.scala-js" % "sbt-scalajs" % "0.6.28") +addSbtPlugin("ch.epfl.lamp" % "sbt-dotty" % "0.4.0") diff --git a/src/main/resources/wikipedia/.keep b/src/main/resources/wikipedia/.keep new file mode 100644 index 0000000..e69de29 diff --git a/src/main/scala/wikipedia/WikipediaData.scala b/src/main/scala/wikipedia/WikipediaData.scala new file mode 100644 index 0000000..2bfc238 --- /dev/null +++ b/src/main/scala/wikipedia/WikipediaData.scala @@ -0,0 +1,21 @@ +package wikipedia + +import scala.io.Source + +object WikipediaData { + + private[wikipedia] def lines: List[String] = { + Option(getClass.getResourceAsStream("/wikipedia/wikipedia-grading.dat")) match { + case None => sys.error("Please download the dataset as explained in the assignment instructions") + case Some(resource) => Source.fromInputStream(resource).getLines().toList + } + } + + private[wikipedia] def parse(line: String): WikipediaArticle = { + val subs = "" + val i = line.indexOf(subs) + val title = line.substring(14, i) + val text = line.substring(i + subs.length, line.length-16) + WikipediaArticle(title, text) + } +} diff --git a/src/main/scala/wikipedia/WikipediaRanking.scala b/src/main/scala/wikipedia/WikipediaRanking.scala new file mode 100644 index 0000000..1306b61 --- /dev/null +++ b/src/main/scala/wikipedia/WikipediaRanking.scala @@ -0,0 +1,96 @@ +package wikipedia + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ +import org.apache.log4j.{Logger, Level} + +import org.apache.spark.rdd.RDD + +case class WikipediaArticle(title: String, text: String) { + /** + * @return Whether the text of this article mentions `lang` or not + * @param lang Language to look for (e.g. "Scala") + */ + def mentionsLanguage(lang: String): Boolean = text.split(' ').contains(lang) +} + +object WikipediaRanking extends WikipediaRankingInterface { + // Reduce Spark logging verbosity + Logger.getLogger("org").setLevel(Level.ERROR) + + val langs = List( + "JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS", + "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy") + + val conf: SparkConf = ??? + val sc: SparkContext = ??? + // Hint: use a combination of `sc.parallelize`, `WikipediaData.lines` and `WikipediaData.parse` + val wikiRdd: RDD[WikipediaArticle] = ??? + + /** Returns the number of articles on which the language `lang` occurs. + * Hint1: consider using method `aggregate` on RDD[T]. + * Hint2: consider using method `mentionsLanguage` on `WikipediaArticle` + */ + def occurrencesOfLang(lang: String, rdd: RDD[WikipediaArticle]): Int = ??? + + /* (1) Use `occurrencesOfLang` to compute the ranking of the languages + * (`val langs`) by determining the number of Wikipedia articles that + * mention each language at least once. Don't forget to sort the + * languages by their occurrence, in decreasing order! + * + * Note: this operation is long-running. It can potentially run for + * several seconds. + */ + def rankLangs(langs: List[String], rdd: RDD[WikipediaArticle]): List[(String, Int)] = ??? + + /* Compute an inverted index of the set of articles, mapping each language + * to the Wikipedia pages in which it occurs. + */ + def makeIndex(langs: List[String], rdd: RDD[WikipediaArticle]): RDD[(String, Iterable[WikipediaArticle])] = ??? + + /* (2) Compute the language ranking again, but now using the inverted index. Can you notice + * a performance improvement? + * + * Note: this operation is long-running. It can potentially run for + * several seconds. + */ + def rankLangsUsingIndex(index: RDD[(String, Iterable[WikipediaArticle])]): List[(String, Int)] = ??? + + /* (3) Use `reduceByKey` so that the computation of the index and the ranking are combined. + * Can you notice an improvement in performance compared to measuring *both* the computation of the index + * and the computation of the ranking? If so, can you think of a reason? + * + * Note: this operation is long-running. It can potentially run for + * several seconds. + */ + def rankLangsReduceByKey(langs: List[String], rdd: RDD[WikipediaArticle]): List[(String, Int)] = ??? + + def main(args: Array[String]): Unit = { + + /* Languages ranked according to (1) */ + val langsRanked: List[(String, Int)] = timed("Part 1: naive ranking", rankLangs(langs, wikiRdd)) + + /* An inverted index mapping languages to wikipedia pages on which they appear */ + def index: RDD[(String, Iterable[WikipediaArticle])] = makeIndex(langs, wikiRdd) + + /* Languages ranked according to (2), using the inverted index */ + val langsRanked2: List[(String, Int)] = timed("Part 2: ranking using inverted index", rankLangsUsingIndex(index)) + + /* Languages ranked according to (3) */ + val langsRanked3: List[(String, Int)] = timed("Part 3: ranking using reduceByKey", rankLangsReduceByKey(langs, wikiRdd)) + + /* Output the speed of each ranking */ + println(timing) + sc.stop() + } + + val timing = new StringBuffer + def timed[T](label: String, code: => T): T = { + val start = System.currentTimeMillis() + val result = code + val stop = System.currentTimeMillis() + timing.append(s"Processing $label took ${stop - start} ms.\n") + result + } +} diff --git a/src/main/scala/wikipedia/WikipediaRankingInterface.scala b/src/main/scala/wikipedia/WikipediaRankingInterface.scala new file mode 100644 index 0000000..eacef4f --- /dev/null +++ b/src/main/scala/wikipedia/WikipediaRankingInterface.scala @@ -0,0 +1,20 @@ +package wikipedia + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ +import org.apache.spark.rdd.RDD + +/** + * The interface used by the grading infrastructure. Do not change signatures + * or your submission will fail with a NoSuchMethodError. + */ +trait WikipediaRankingInterface { + def makeIndex(langs: List[String], rdd: RDD[WikipediaArticle]): RDD[(String, Iterable[WikipediaArticle])] + def occurrencesOfLang(lang: String, rdd: RDD[WikipediaArticle]): Int + def rankLangs(langs: List[String], rdd: RDD[WikipediaArticle]): List[(String, Int)] + def rankLangsReduceByKey(langs: List[String], rdd: RDD[WikipediaArticle]): List[(String, Int)] + def rankLangsUsingIndex(index: RDD[(String, Iterable[WikipediaArticle])]): List[(String, Int)] + def langs: List[String] + def sc: SparkContext + def wikiRdd: RDD[WikipediaArticle] +} diff --git a/src/test/scala/wikipedia/WikipediaSuite.scala b/src/test/scala/wikipedia/WikipediaSuite.scala new file mode 100644 index 0000000..aff7d25 --- /dev/null +++ b/src/test/scala/wikipedia/WikipediaSuite.scala @@ -0,0 +1,142 @@ +package wikipedia + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ +import org.junit._ + +class WikipediaSuite { + def initializeWikipediaRanking(): Boolean = + try { + WikipediaRanking + true + } catch { + case ex: Throwable => + println(ex.getMessage) + ex.printStackTrace() + false + } + + import WikipediaRanking._ + + /** + * Creates a truncated string representation of a list, adding ", ...)" if there + * are too many elements to show + * @param l The list to preview + * @param n The number of elements to cut it at + * @return A preview of the list, containing at most n elements. + */ + def previewList[A](l: List[A], n: Int = 10): String = + if (l.length <= n) l.toString + else l.take(n).toString.dropRight(1) + ", ...)" + + /** + * Asserts that all the elements in a given list and an expected list are the same, + * regardless of order. For a prettier output, given and expected should be sorted + * with the same ordering. + * @param actual The actual list + * @param expected The expected list + * @tparam A Type of the list elements + */ + def assertSameElements[A](actual: List[A], expected: List[A]): Unit = { + val givenSet = actual.toSet + val expectedSet = expected.toSet + + val unexpected = givenSet -- expectedSet + val missing = expectedSet -- givenSet + + val noUnexpectedElements = unexpected.isEmpty + val noMissingElements = missing.isEmpty + + val noMatchString = + s""" + |Expected: ${previewList(expected)} + |Actual: ${previewList(actual)}""".stripMargin + + assert(noUnexpectedElements, + s"""|$noMatchString + |The given collection contains some unexpected elements: ${previewList(unexpected.toList, 5)}""".stripMargin) + + assert(noMissingElements, + s"""|$noMatchString + |The given collection is missing some expected elements: ${previewList(missing.toList, 5)}""".stripMargin) + } + + // Conditions: + // (1) the language stats contain the same elements + // (2) they are ordered (and the order doesn't matter if there are several languages with the same count) + def assertEquivalentAndOrdered(actual: List[(String, Int)], expected: List[(String, Int)]): Unit = { + // (1) + assertSameElements(actual, expected) + // (2) + assert( + !(actual zip actual.tail).exists({ case ((_, occ1), (_, occ2)) => occ1 < occ2 }), + "The given elements are not in descending order" + ) + } + + @Test def `'occurrencesOfLang' should work for (specific) RDD with one element`: Unit = { + assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?") + val rdd = sc.parallelize(Seq(WikipediaArticle("title", "Java Jakarta"))) + val res = (occurrencesOfLang("Java", rdd) == 1) + assert(res, "occurrencesOfLang given (specific) RDD with one element should equal to 1") + } + + @Test def `'rankLangs' should work for RDD with two elements`: Unit = { + assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?") + val langs = List("Scala", "Java") + val rdd = sc.parallelize(List(WikipediaArticle("1", "Scala is great"), WikipediaArticle("2", "Java is OK, but Scala is cooler"))) + val ranked = rankLangs(langs, rdd) + val res = ranked.head._1 == "Scala" + assert(res) + } + + @Test def `'makeIndex' creates a simple index with two entries`: Unit = { + assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?") + val langs = List("Scala", "Java") + val articles = List( + WikipediaArticle("1","Groovy is pretty interesting, and so is Erlang"), + WikipediaArticle("2","Scala and Java run on the JVM"), + WikipediaArticle("3","Scala is not purely functional") + ) + val rdd = sc.parallelize(articles) + val index = makeIndex(langs, rdd) + val res = index.count() == 2 + assert(res) + } + + @Test def `'rankLangsUsingIndex' should work for a simple RDD with three elements`: Unit = { + assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?") + val langs = List("Scala", "Java") + val articles = List( + WikipediaArticle("1","Groovy is pretty interesting, and so is Erlang"), + WikipediaArticle("2","Scala and Java run on the JVM"), + WikipediaArticle("3","Scala is not purely functional") + ) + val rdd = sc.parallelize(articles) + val index = makeIndex(langs, rdd) + val ranked = rankLangsUsingIndex(index) + val res = (ranked.head._1 == "Scala") + assert(res) + } + + @Test def `'rankLangsReduceByKey' should work for a simple RDD with five elements`: Unit = { + assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?") + val langs = List("Scala", "Java", "Groovy", "Haskell", "Erlang") + val articles = List( + WikipediaArticle("1","Groovy is pretty interesting, and so is Erlang"), + WikipediaArticle("2","Scala and Java run on the JVM"), + WikipediaArticle("3","Scala is not purely functional"), + WikipediaArticle("4","The cool kids like Haskell more than Java"), + WikipediaArticle("5","Java is for enterprise developers") + ) + val rdd = sc.parallelize(articles) + val ranked = rankLangsReduceByKey(langs, rdd) + val res = (ranked.head._1 == "Java") + assert(res) + } + + + + @Rule def individualTestTimeout = new org.junit.rules.Timeout(100 * 1000) +}