Compare commits
No commits in common. "submission-stackoverflow" and "actorbintree" have entirely different histories.
.gitignore (vendored) | 4
@@ -14,7 +14,3 @@ target/
 # Dotty IDE
 /.dotty-ide-artifact
 /.dotty-ide.json
-
-# datasets
-stackoverflow-grading.csv
-wikipedia-grading.dat
@@ -25,7 +25,7 @@ grade:
   tags:
     - cs206
   image:
-    name: smarter3/moocs:bigdata-stackoverflow-2020-05-11-2
+    name: smarter3/moocs:reactive-actorbintree-2020-04-15
     entrypoint: [""]
   allow_failure: true
   before_script:
@@ -1,9 +1,4 @@
 // Student tasks (i.e. submit, packageSubmission)
 enablePlugins(StudentTasks)
 
-courseraId := ch.epfl.lamp.CourseraId(
-  key = "7ByAoS4kEea1yxIfJA1CUw",
-  itemId = "QhzMw",
-  premiumItemId = Some("FWGnz"),
-  partId = "OY5fJ"
-)
build.sbt | 36
@@ -1,20 +1,24 @@
-course := "bigdata"
-assignment := "stackoverflow"
+course := "reactive"
+assignment := "actorbintree"
 
-scalaVersion := "0.24.0-RC1"
-scalacOptions ++= Seq("-language:implicitConversions", "-deprecation")
-libraryDependencies ++= Seq(
-  "com.novocode" % "junit-interface" % "0.11" % Test,
-  ("org.apache.spark" %% "spark-core" % "3.0.0-X1").withDottyCompat(scalaVersion.value),
-)
-
-// Contains Spark 3 snapshot built against 2.13: https://github.com/smarter/spark/tree/scala-2.13
-resolvers += Resolver.bintrayRepo("smarter", "maven")
-
 testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-a", "-v", "-s")
+parallelExecution in Test := false
 
-testSuite := "stackoverflow.StackOverflowSuite"
+val akkaVersion = "2.6.0"
 
-// Without forking, ctrl-c doesn't actually fully stop Spark
-fork in run := true
-fork in Test := true
+scalaVersion := "0.23.0-bin-20200211-5b006fb-NIGHTLY"
+scalacOptions ++= Seq(
+  "-feature",
+  "-deprecation",
+  "-encoding", "UTF-8",
+  "-unchecked",
+  "-language:implicitConversions"
+)
+
+libraryDependencies ++= Seq(
+  "com.typesafe.akka" %% "akka-actor" % akkaVersion,
+  "com.typesafe.akka" %% "akka-testkit" % akkaVersion % Test,
+  "com.novocode" % "junit-interface" % "0.11" % Test
+).map(_.withDottyCompat(scalaVersion.value))
+
+testSuite := "actorbintree.BinaryTreeSuite"
Binary file not shown.
src/main/scala/actorbintree/BinaryTreeSet.scala (new file) | 189
@@ -0,0 +1,189 @@
/**
 * Copyright (C) 2009-2013 Typesafe Inc. <http://www.typesafe.com>
 */
package actorbintree

import akka.actor._
import scala.collection.immutable.Queue

object BinaryTreeSet {

  trait Operation {
    def requester: ActorRef
    def id: Int
    def elem: Int
  }

  trait OperationReply {
    def id: Int
  }

  /** Request with identifier `id` to insert an element `elem` into the tree.
    * The actor at reference `requester` should be notified when this operation
    * is completed.
    */
  case class Insert(requester: ActorRef, id: Int, elem: Int) extends Operation

  /** Request with identifier `id` to check whether an element `elem` is present
    * in the tree. The actor at reference `requester` should be notified when
    * this operation is completed.
    */
  case class Contains(requester: ActorRef, id: Int, elem: Int) extends Operation

  /** Request with identifier `id` to remove the element `elem` from the tree.
    * The actor at reference `requester` should be notified when this operation
    * is completed.
    */
  case class Remove(requester: ActorRef, id: Int, elem: Int) extends Operation

  /** Request to perform garbage collection */
  case object GC

  /** Holds the answer to the Contains request with identifier `id`.
    * `result` is true if and only if the element is present in the tree.
    */
  case class ContainsResult(id: Int, result: Boolean) extends OperationReply

  /** Message to signal successful completion of an insert or remove operation. */
  case class OperationFinished(id: Int) extends OperationReply

}


class BinaryTreeSet extends Actor {
  import BinaryTreeSet._
  import BinaryTreeNode._

  def createRoot: ActorRef = context.actorOf(BinaryTreeNode.props(0, initiallyRemoved = true))

  var root = createRoot

  // optional
  var pendingQueue = Queue.empty[Operation]

  // optional
  def receive = normal

  // optional
  /** Accepts `Operation` and `GC` messages. */
  val normal: Receive = {
    case op: Operation => root ! op
    case GC =>
      val newRoot = createRoot
      root ! CopyTo(newRoot)
      context.become(garbageCollecting(newRoot))
  }

  // optional
  /** Handles messages while garbage collection is performed.
    * `newRoot` is the root of the new binary tree where we want to copy
    * all non-removed elements into.
    */
  def garbageCollecting(newRoot: ActorRef): Receive = {
    case op: Operation => pendingQueue = pendingQueue.enqueue(op)
    case CopyFinished =>
      pendingQueue.foreach(newRoot ! _) // foreach preserves the order of a queue (same as dequeueing)
      root ! PoisonPill // will also stop all of its children
      pendingQueue = Queue.empty
      root = newRoot
      context.become(normal)
    // GC messages are ignored here
  }
}

object BinaryTreeNode {
  trait Position

  case object Left extends Position
  case object Right extends Position

  case class CopyTo(treeNode: ActorRef)
  case object CopyFinished

  def props(elem: Int, initiallyRemoved: Boolean) = Props(classOf[BinaryTreeNode], elem, initiallyRemoved)
}

class BinaryTreeNode(val elem: Int, initiallyRemoved: Boolean) extends Actor {
  import BinaryTreeNode._
  import BinaryTreeSet._

  var subtrees = Map[Position, ActorRef]()
  var removed = initiallyRemoved

  // optional
  def receive = normal

  def goDownTo(elem: Int): Position = if (elem < this.elem) Left else Right

  // optional
  /** Handles `Operation` messages and `CopyTo` requests. */
  val normal: Receive = {
    case Insert(requester, id, elem) =>
      if (elem == this.elem && !removed) {
        requester ! OperationFinished(id)
      } else {
        val nextPos = goDownTo(elem)

        subtrees get nextPos match {
          case Some(node) => node ! Insert(requester, id, elem)
          case None =>
            val newActorSubtree = (nextPos, context.actorOf(BinaryTreeNode.props(elem, false)))
            subtrees = subtrees + newActorSubtree
            requester ! OperationFinished(id)
        }
      }

    case Contains(requester, id, elem) =>
      if (elem == this.elem && !removed)
        requester ! ContainsResult(id, true)
      else {
        // need to search the subtrees
        subtrees get goDownTo(elem) match {
          case Some(node) => node ! Contains(requester, id, elem)
          case None => requester ! ContainsResult(id, false)
        }
      }

    case Remove(requester, id, elem) =>
      if (elem == this.elem && !removed) {
        removed = true
        requester ! OperationFinished(id)
      } else {
        subtrees get goDownTo(elem) match {
          case Some(node) => node ! Remove(requester, id, elem)
          case None => requester ! OperationFinished(id) // elem isn't in the tree
        }
      }

    case CopyTo(newRoot) =>
      // we are already done, nothing to do
      if (removed && subtrees.isEmpty) context.parent ! CopyFinished
      else {
        if (!removed) newRoot ! Insert(self, elem, elem)
        subtrees.values foreach (_ ! CopyTo(newRoot)) // copy the subtrees' elements
        // val insertConfirmed = if (removed) true else false, hence we can simply pass removed
        context.become(copying(subtrees.values.toSet, removed))
      }
  }

  // optional
  /** `expected` is the set of ActorRefs whose replies we are waiting for,
    * `insertConfirmed` tracks whether the copy of this node to the new tree has been confirmed.
    */
  def copying(expected: Set[ActorRef], insertConfirmed: Boolean): Receive = {
    // to catch the insert of this node into the new tree being finished
    case OperationFinished(_) =>
      if (expected.isEmpty) context.parent ! CopyFinished
      else context.become(copying(expected, true))

    case CopyFinished =>
      val newExpected = expected - sender
      if (insertConfirmed && newExpected.isEmpty)
        context.parent ! CopyFinished
      else
        context.become(copying(newExpected, insertConfirmed))
  }
}
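For orientation, the message protocol defined above can be exercised end to end with a small driver. The following is a sketch and not part of the diff: the object name BinaryTreeSetDemo, the actor-system name, and the printing requester actor are all invented for illustration.

package actorbintree

import akka.actor._

// Hypothetical driver (not part of the submission): exercises Insert,
// Contains and GC against the BinaryTreeSet defined above.
object BinaryTreeSetDemo extends App {
  import BinaryTreeSet._

  val system = ActorSystem("BinaryTreeSetDemo")
  val tree = system.actorOf(Props[BinaryTreeSet])

  // Throwaway requester that just prints every reply it receives.
  val requester = system.actorOf(Props(new Actor {
    def receive = { case reply: OperationReply => println(reply) }
  }))

  tree ! Insert(requester, id = 1, elem = 42)   // expect OperationFinished(1)
  tree ! Contains(requester, id = 2, elem = 42) // expect ContainsResult(2, true)
  tree ! GC                                     // live elements are copied to a fresh root
  tree ! Contains(requester, id = 3, elem = 42) // still ContainsResult(3, true) after GC
}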
src/main/scala/stackoverflow/StackOverflow.scala (deleted file) | 329
@@ -1,329 +0,0 @@
package stackoverflow

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.log4j.{Logger, Level}

import annotation.tailrec
import scala.reflect.ClassTag
import scala.util.Properties.isWin

type Question = Posting
type Answer = Posting
type QID = Int
type HighScore = Int
type LangIndex = Int

/** A raw stackoverflow posting, either a question or an answer */
case class Posting(postingType: Int, id: Int, acceptedAnswer: Option[Int], parentId: Option[QID], score: Int, tags: Option[String]) extends Serializable

/** The main class */
object StackOverflow extends StackOverflow {

  // Reduce Spark logging verbosity
  Logger.getLogger("org").setLevel(Level.ERROR)

  if (isWin) System.setProperty("hadoop.home.dir", System.getProperty("user.dir") + "\\winutils\\hadoop-2.7.4")

  @transient lazy val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("StackOverflow")
  @transient lazy val sc: SparkContext = new SparkContext(conf)

  /** Main function */
  def main(args: Array[String]): Unit = {

    val lines = sc.textFile("src/main/resources/stackoverflow/stackoverflow-grading.csv")
    val raw = rawPostings(lines)
    val grouped = groupedPostings(raw)
    val scored = scoredPostings(grouped)
    val vectors = vectorPostings(scored)
    // assert(vectors.count() == 2121822, "Incorrect number of vectors: " + vectors.count())

    val means = kmeans(sampleVectors(vectors), vectors, debug = true)
    val results = clusterResults(means, vectors)
    printResults(results)
  }
}

/** The parsing and kmeans methods */
class StackOverflow extends Serializable {

  /** Languages */
  val langs =
    List(
      "JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS",
      "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy")

  /** K-means parameter: how "far apart" should languages be for the kmeans algorithm? */
  def langSpread = 50000
  assert(langSpread > 0, "If langSpread is zero we can't recover the language from the input data!")

  /** K-means parameter: Number of clusters */
  def kmeansKernels = 45

  /** K-means parameter: Convergence criteria */
  def kmeansEta: Double = 20.0D

  /** K-means parameter: Maximum iterations */
  def kmeansMaxIterations = 120


  //
  //
  // Parsing utilities:
  //
  //

  /** Load postings from the given file */
  def rawPostings(lines: RDD[String]): RDD[Posting] =
    lines.map(line => {
      val arr = line.split(",")
      Posting(postingType = arr(0).toInt,
              id = arr(1).toInt,
              acceptedAnswer = if (arr(2) == "") None else Some(arr(2).toInt),
              parentId = if (arr(3) == "") None else Some(arr(3).toInt),
              score = arr(4).toInt,
              tags = if (arr.length >= 6) Some(arr(5).intern()) else None)
    })


  /** Group the questions and answers together */
  def groupedPostings(postings: RDD[Posting]): RDD[(QID, Iterable[(Question, Answer)])] = {
    val questions: RDD[Question] = postings.filter(_.postingType == 1)
    val answers: RDD[Answer] = postings.filter(_.postingType == 2)
    val qidQuestionPair: RDD[(QID, Question)] = questions.map(q => (q.id, q))
    val qidAnswerPair: RDD[(QID, Answer)] = answers.map(a => (a.parentId.get, a))

    val qidQA: RDD[(QID, (Question, Answer))] = qidQuestionPair.join(qidAnswerPair)
    qidQA.groupByKey
  }


  /** Compute the maximum score for each posting */
  def scoredPostings(grouped: RDD[(QID, Iterable[(Question, Answer)])]): RDD[(Question, HighScore)] = {

    def answerHighScore(as: Array[Answer]): HighScore = {
      var highScore = 0
      var i = 0
      while (i < as.length) {
        val score = as(i).score
        if (score > highScore)
          highScore = score
        i += 1
      }
      highScore
    }

    grouped.map { case (_, iter) =>
      val question = iter.head._1
      val answers = iter map { case (q, a) => a }
      (question, answerHighScore(answers.toArray))
    }
  }


  /** Compute the vectors for the kmeans */
  def vectorPostings(scored: RDD[(Question, HighScore)]): RDD[(LangIndex, HighScore)] = {
    /** Return optional index of first language that occurs in `tags`. */
    def firstLangInTag(tag: Option[String], ls: List[String]): Option[Int] = {
      if (tag.isEmpty) None
      else if (ls.isEmpty) None
      else if (tag.get == ls.head) Some(0) // index: 0
      else {
        val tmp = firstLangInTag(tag, ls.tail)
        tmp match {
          case None => None
          case Some(i) => Some(i + 1) // index i in ls.tail => index i+1
        }
      }
    }

    scored.map { case (q, hs) => (firstLangInTag(q.tags, langs).get * langSpread, hs) }.persist()
  }


  /** Sample the vectors */
  def sampleVectors(vectors: RDD[(LangIndex, HighScore)]): Array[(Int, Int)] = {

    assert(kmeansKernels % langs.length == 0, "kmeansKernels should be a multiple of the number of languages studied.")
    val perLang = kmeansKernels / langs.length

    // http://en.wikipedia.org/wiki/Reservoir_sampling
    def reservoirSampling(lang: Int, iter: Iterator[Int], size: Int): Array[Int] = {
      val res = new Array[Int](size)
      val rnd = new util.Random(lang)

      for (i <- 0 until size) {
        assert(iter.hasNext, s"iterator must have at least $size elements")
        res(i) = iter.next
      }

      var i = size.toLong
      while (iter.hasNext) {
        val elt = iter.next
        val j = math.abs(rnd.nextLong) % i
        if (j < size)
          res(j.toInt) = elt
        i += 1
      }

      res
    }

    val res =
      if (langSpread < 500)
        // sample the space regardless of the language
        vectors.takeSample(false, kmeansKernels, 42)
      else
        // sample the space uniformly from each language partition
        vectors.groupByKey.flatMap({
          case (lang, vectors) => reservoirSampling(lang, vectors.iterator, perLang).map((lang, _))
        }).collect()

    assert(res.length == kmeansKernels, res.length)
    res
  }


  //
  //
  // Kmeans method:
  //
  //

  /** Main kmeans computation */
  @tailrec final def kmeans(means: Array[(Int, Int)], vectors: RDD[(Int, Int)], iter: Int = 1, debug: Boolean = false): Array[(Int, Int)] = {
    val newMeans = means.clone() // you need to compute newMeans

    // TODO: Fill in the newMeans array
    val distance = euclideanDistance(means, newMeans)

    if (debug) {
      println(s"""Iteration: $iter
                 |  * current distance: $distance
                 |  * desired distance: $kmeansEta
                 |  * means:""".stripMargin)
      for (idx <- 0 until kmeansKernels)
        println(f"   ${means(idx).toString}%20s ==> ${newMeans(idx).toString}%20s " +
                f"  distance: ${euclideanDistance(means(idx), newMeans(idx))}%8.0f")
    }

    if (converged(distance))
      newMeans
    else if (iter < kmeansMaxIterations)
      kmeans(newMeans, vectors, iter + 1, debug)
    else {
      if (debug) {
        println("Reached max iterations!")
      }
      newMeans
    }
  }


  //
  //
  // Kmeans utilities:
  //
  //

  /** Decide whether the kmeans clustering converged */
  def converged(distance: Double) =
    distance < kmeansEta

  /** Return the euclidean distance between two points */
  def euclideanDistance(v1: (Int, Int), v2: (Int, Int)): Double = {
    val part1 = (v1._1 - v2._1).toDouble * (v1._1 - v2._1)
    val part2 = (v1._2 - v2._2).toDouble * (v1._2 - v2._2)
    part1 + part2
  }

  /** Return the summed euclidean distance between two arrays of points */
  def euclideanDistance(a1: Array[(Int, Int)], a2: Array[(Int, Int)]): Double = {
    assert(a1.length == a2.length)
    var sum = 0d
    var idx = 0
    while (idx < a1.length) {
      sum += euclideanDistance(a1(idx), a2(idx))
      idx += 1
    }
    sum
  }

  /** Return the closest point */
  def findClosest(p: (Int, Int), centers: Array[(Int, Int)]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity
    for (i <- 0 until centers.length) {
      val tempDist = euclideanDistance(p, centers(i))
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }
    bestIndex
  }

  /** Average the vectors */
  def averageVectors(ps: Iterable[(Int, Int)]): (Int, Int) = {
    val iter = ps.iterator
    var count = 0
    var comp1: Long = 0
    var comp2: Long = 0
    while (iter.hasNext) {
      val item = iter.next
      comp1 += item._1
      comp2 += item._2
      count += 1
    }
    ((comp1 / count).toInt, (comp2 / count).toInt)
  }


  //
  //
  // Displaying results:
  //
  //
  def clusterResults(means: Array[(Int, Int)], vectors: RDD[(LangIndex, HighScore)]): Array[(String, Double, Int, Int)] = {
    val closest = vectors.map(p => (findClosest(p, means), p))
    val closestGrouped = closest.groupByKey()

    val median = closestGrouped.mapValues { vs =>
      val dominantLangIndex = vs.groupBy(_._1).maxBy(_._2.size)._1 / langSpread
      val langLabel: String = langs(dominantLangIndex) // most common language in the cluster

      val nbOfDominant = vs.count(_._1 / langSpread == dominantLangIndex)
      val langPercent: Double = 100.0 * (nbOfDominant.toDouble / vs.size) // percent of the questions in the most common language
      val clusterSize: Int = vs.size
      val medianScore: Int = {
        val highscores = vs.map { case (li, hs) => hs }
        val sortedHighscores = highscores.toArray.sorted
        val mid = highscores.size / 2
        highscores.size % 2 match {
          case 1 => sortedHighscores(mid)
          case 0 => (sortedHighscores(mid - 1) + sortedHighscores(mid)) / 2
        }
      }

      (langLabel, langPercent, clusterSize, medianScore)
    }

    median.collect().map(_._2).sortBy(_._4)
  }

  def printResults(results: Array[(String, Double, Int, Int)]): Unit = {
    println("Resulting clusters:")
    println("  Score  Dominant language (%percent)  Questions")
    println("================================================")
    for ((lang, percent, size, score) <- results)
      println(f"${score}%7d  ${lang}%-17s (${percent}%-5.1f%%)  ${size}%7d")
  }
}
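Note that kmeans above still carries its TODO: newMeans is merely a clone of means, so the distance is always zero and the loop converges immediately. One plausible completion, sketched as a standalone helper using only findClosest and averageVectors from this same file and assuming the surrounding class scope for the RDD import — an illustration, not necessarily what the submission intended:

// Hypothetical helper completing the TODO in kmeans: assign every vector to
// its closest mean, average each cluster, and keep the old mean for any
// cluster that received no points.
def updateMeans(means: Array[(Int, Int)], vectors: RDD[(Int, Int)]): Array[(Int, Int)] = {
  val newMeans = means.clone()
  vectors
    .map(v => (findClosest(v, means), v)) // cluster index -> point
    .groupByKey()
    .mapValues(averageVectors)            // cluster index -> new centroid
    .collect()
    .foreach { case (idx, mean) => newMeans(idx) = mean }
  newMeans
}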
src/test/scala/actorbintree/BinaryTreeSuite.scala (new file) | 126
@@ -0,0 +1,126 @@
/**
 * Copyright (C) 2009-2015 Typesafe Inc. <http://www.typesafe.com>
 */
package actorbintree

import akka.actor.{ActorRef, ActorSystem, Props, actorRef2Scala, scala2ActorRef}
import akka.testkit.{ImplicitSender, TestKit, TestProbe}
import org.junit.Test
import org.junit.Assert._

import scala.util.Random
import scala.concurrent.duration._

class BinaryTreeSuite extends TestKit(ActorSystem("BinaryTreeSuite")) with ImplicitSender {

  import actorbintree.BinaryTreeSet._

  def receiveN(requester: TestProbe, ops: Seq[Operation], expectedReplies: Seq[OperationReply]): Unit =
    requester.within(5.seconds) {
      val repliesUnsorted = for (i <- 1 to ops.size) yield try {
        requester.expectMsgType[OperationReply]
      } catch {
        case ex: Throwable if ops.size > 10 => sys.error(s"failure to receive confirmation $i/${ops.size}\n$ex")
        case ex: Throwable => sys.error(s"failure to receive confirmation $i/${ops.size}\nRequests:" + ops.mkString("\n ", "\n ", "") + s"\n$ex")
      }
      val replies = repliesUnsorted.sortBy(_.id)
      if (replies != expectedReplies) {
        val pairs = (replies zip expectedReplies).zipWithIndex filter (x => x._1._1 != x._1._2)
        fail("unexpected replies:" + pairs.map(x => s"at index ${x._2}: got ${x._1._1}, expected ${x._1._2}").mkString("\n ", "\n ", ""))
      }
    }

  def verify(probe: TestProbe, ops: Seq[Operation], expected: Seq[OperationReply]): Unit = {
    val topNode = system.actorOf(Props[BinaryTreeSet])

    ops foreach { op =>
      topNode ! op
    }

    receiveN(probe, ops, expected)
    // the grader also verifies that enough actors are created
  }

  @Test def `proper inserts and lookups (5pts)`(): Unit = {
    val topNode = system.actorOf(Props[BinaryTreeSet])

    topNode ! Contains(testActor, id = 1, 1)
    expectMsg(ContainsResult(1, false))

    topNode ! Insert(testActor, id = 2, 1)
    topNode ! Contains(testActor, id = 3, 1)

    expectMsg(OperationFinished(2))
    expectMsg(ContainsResult(3, true))
    ()
  }

  @Test def `instruction example (5pts)`(): Unit = {
    val requester = TestProbe()
    val requesterRef = requester.ref
    val ops = List(
      Insert(requesterRef, id = 100, 1),
      Contains(requesterRef, id = 50, 2),
      Remove(requesterRef, id = 10, 1),
      Insert(requesterRef, id = 20, 2),
      Contains(requesterRef, id = 80, 1),
      Contains(requesterRef, id = 70, 2)
    )

    val expectedReplies = List(
      OperationFinished(id = 10),
      OperationFinished(id = 20),
      ContainsResult(id = 50, false),
      ContainsResult(id = 70, true),
      ContainsResult(id = 80, false),
      OperationFinished(id = 100)
    )

    verify(requester, ops, expectedReplies)
  }

  @Test def `behave identically to built-in set (includes GC) (40pts)`(): Unit = {
    val rnd = new Random()
    def randomOperations(requester: ActorRef, count: Int): Seq[Operation] = {
      def randomElement: Int = rnd.nextInt(100)
      def randomOperation(requester: ActorRef, id: Int): Operation = rnd.nextInt(4) match {
        case 0 => Insert(requester, id, randomElement)
        case 1 => Insert(requester, id, randomElement)
        case 2 => Contains(requester, id, randomElement)
        case 3 => Remove(requester, id, randomElement)
      }

      for (seq <- 0 until count) yield randomOperation(requester, seq)
    }

    def referenceReplies(operations: Seq[Operation]): Seq[OperationReply] = {
      var referenceSet = Set.empty[Int]
      def replyFor(op: Operation): OperationReply = op match {
        case Insert(_, seq, elem) =>
          referenceSet = referenceSet + elem
          OperationFinished(seq)
        case Remove(_, seq, elem) =>
          referenceSet = referenceSet - elem
          OperationFinished(seq)
        case Contains(_, seq, elem) =>
          ContainsResult(seq, referenceSet(elem))
      }

      for (op <- operations) yield replyFor(op)
    }

    val requester = TestProbe()
    val topNode = system.actorOf(Props[BinaryTreeSet])
    val count = 1000

    val ops = randomOperations(requester.ref, count)
    val expectedReplies = referenceReplies(ops)

    ops foreach { op =>
      topNode ! op
      if (rnd.nextDouble() < 0.1) topNode ! GC
    }
    receiveN(requester, ops, expectedReplies)
  }
}
src/test/scala/stackoverflow/StackOverflowSuite.scala (deleted file) | 47
@@ -1,47 +0,0 @@
package stackoverflow

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.junit._
import org.junit.Assert.assertEquals
import java.io.File
import scala.io.{ Codec, Source }
import scala.util.Properties.isWin

object StackOverflowSuite {
  if (isWin) System.setProperty("hadoop.home.dir", System.getProperty("user.dir") + "\\winutils\\hadoop-2.7.4")

  val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("StackOverflow")
  val sc: SparkContext = new SparkContext(conf)
}

class StackOverflowSuite {
  import StackOverflowSuite._

  lazy val testObject = new StackOverflow {
    override val langs =
      List(
        "JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS",
        "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy")
    override def langSpread = 50000
    override def kmeansKernels = 45
    override def kmeansEta: Double = 20.0D
    override def kmeansMaxIterations = 120
  }

  @Test def `testObject can be instantiated`: Unit = {
    val instantiatable = try {
      testObject
      true
    } catch {
      case _: Throwable => false
    }
    assert(instantiatable, "Can't instantiate a StackOverflow object")
  }

  @Rule def individualTestTimeout = new org.junit.rules.Timeout(300 * 1000)
}
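This suite only checks instantiation. If one wanted to cover actual behavior, a test along the following lines would fit inside the class above; the three sample postings and the expected grouping are invented for illustration, and it assumes the groupedPostings implementation shown earlier in this diff:

// Hypothetical extra test for StackOverflowSuite: groupedPostings should
// pair each answer with its question by QID, and unanswered questions
// should disappear (it is an inner join). All sample data below is made up.
@Test def `groupedPostings pairs answers with their question`: Unit = {
  val question = Posting(postingType = 1, id = 1, acceptedAnswer = None, parentId = None, score = 3, tags = Some("Scala"))
  val answer   = Posting(postingType = 2, id = 2, acceptedAnswer = None, parentId = Some(1), score = 5, tags = None)
  val orphan   = Posting(postingType = 1, id = 9, acceptedAnswer = None, parentId = None, score = 0, tags = Some("Java"))

  val grouped = testObject.groupedPostings(sc.parallelize(List(question, answer, orphan))).collect().toMap

  assertEquals(1, grouped.size) // the unanswered question is dropped by the join
  assertEquals(List((question, answer)), grouped(1).toList)
}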
Binary file not shown.