From f45198723a0bd9dd8db04ddbc70ea7296d2485dc Mon Sep 17 00:00:00 2001 From: Charles Beauvile Date: Thu, 21 May 2020 12:23:44 +0200 Subject: [PATCH] 2 failures --- .../scala/stackoverflow/StackOverflow.scala | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/src/main/scala/stackoverflow/StackOverflow.scala b/src/main/scala/stackoverflow/StackOverflow.scala index 42a75b7..e93b7ff 100644 --- a/src/main/scala/stackoverflow/StackOverflow.scala +++ b/src/main/scala/stackoverflow/StackOverflow.scala @@ -90,7 +90,13 @@ class StackOverflow extends Serializable { /** Group the questions and answers together */ def groupedPostings(postings: RDD[Posting]): RDD[(QID, Iterable[(Question, Answer)])] = { - ??? + val questions : RDD[Question] = postings.filter(_.postingType == 1) + val answers : RDD[Answer] = postings.filter(_.postingType == 2) + val qidQuestionPair : RDD[(QID, Question)] = questions.map(q => (q.id, q)) + val qidAnswerPair : RDD[(QID, Answer)] = answers.map(a => (a.parentId.get, a)) + + val qidQA : RDD[(QID, (Question, Answer))] = qidQuestionPair.join(qidAnswerPair) + qidQA.groupByKey } @@ -109,7 +115,11 @@ class StackOverflow extends Serializable { highScore } - ??? + grouped.map{case (_, iter) => + val question = iter.head._1; + val answers = iter map { case (q, a) => a } + (question, answerHighScore(answers.toArray)) + } } @@ -129,7 +139,7 @@ class StackOverflow extends Serializable { } } - ??? + scored.map{ case (q, hs) => (firstLangInTag(q.tags, langs).get * langSpread, hs)}.persist() } @@ -287,10 +297,21 @@ class StackOverflow extends Serializable { val closestGrouped = closest.groupByKey() val median = closestGrouped.mapValues { vs => - val langLabel: String = ??? // most common language in the cluster - val langPercent: Double = ??? // percent of the questions in the most common language - val clusterSize: Int = ??? - val medianScore: Int = ??? + val dominantLangIndex = vs.groupBy(_._1).maxBy(_._2.size)._1 / langSpread + val langLabel: String = langs(dominantLangIndex)// most common language in the cluster + + val nbOfDominant = vs.count(_._1 / langSpread == dominantLangIndex) + val langPercent: Double = 100.0 * (nbOfDominant.toDouble / vs.size) // percent of the questions in the most common language + val clusterSize: Int = vs.size + val medianScore: Int = { + val highscores = vs.map{ case (li, hs) => hs } + val sortedHighscores = highscores.toArray.sorted; + val mid = highscores.size / 2; + highscores.size % 2 match{ + case 1 => sortedHighscores(mid) + case 0 => (sortedHighscores(mid-1) + sortedHighscores(mid)) / 2 + } + } (langLabel, langPercent, clusterSize, medianScore) }