From e408ad50aec693fddf9d462f00390a93241017d5 Mon Sep 17 00:00:00 2001 From: Guillaume Martres Date: Tue, 19 Feb 2019 20:44:23 +0100 Subject: [PATCH] Add wikipedia assignment --- .gitignore | 20 ++ .gitlab-ci.yml | 36 ++ .vscode/settings.json | 8 + assignment.sbt | 9 + build.sbt | 20 ++ grading-tests.jar | Bin 0 -> 16739 bytes project/MOOCSettings.scala | 46 +++ project/StudentTasks.scala | 318 ++++++++++++++++++ project/build.properties | 1 + project/buildSettings.sbt | 5 + project/plugins.sbt | 2 + src/main/resources/wikipedia/.keep | 0 src/main/scala/wikipedia/WikipediaData.scala | 21 ++ .../scala/wikipedia/WikipediaRanking.scala | 96 ++++++ .../wikipedia/WikipediaRankingInterface.scala | 20 ++ src/test/scala/wikipedia/WikipediaSuite.scala | 142 ++++++++ 16 files changed, 744 insertions(+) create mode 100644 .gitignore create mode 100644 .gitlab-ci.yml create mode 100644 .vscode/settings.json create mode 100644 assignment.sbt create mode 100644 build.sbt create mode 100644 grading-tests.jar create mode 100644 project/MOOCSettings.scala create mode 100644 project/StudentTasks.scala create mode 100644 project/build.properties create mode 100644 project/buildSettings.sbt create mode 100644 project/plugins.sbt create mode 100644 src/main/resources/wikipedia/.keep create mode 100644 src/main/scala/wikipedia/WikipediaData.scala create mode 100644 src/main/scala/wikipedia/WikipediaRanking.scala create mode 100644 src/main/scala/wikipedia/WikipediaRankingInterface.scala create mode 100644 src/test/scala/wikipedia/WikipediaSuite.scala diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..996b5d0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +# General +*.DS_Store +*.swp +*~ + +# Dotty +*.class +*.tasty +*.hasTasty + +# sbt +target/ + +# Dotty IDE +/.dotty-ide-artifact +/.dotty-ide.json + +# datasets +stackoverflow-grading.csv +wikipedia-grading.dat diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..e5ec826 --- /dev/null 
+++ b/.gitlab-ci.yml @@ -0,0 +1,36 @@ +# DO NOT EDIT THIS FILE + +stages: + - build + - grade + +compile: + stage: build + image: lampepfl/moocs:dotty-2020-02-12 + except: + - tags + tags: + - cs206 + script: + - sbt packageSubmission + artifacts: + expire_in: 1 day + paths: + - submission.jar + +grade: + stage: grade + except: + - tags + tags: + - cs206 + image: + name: smarter3/moocs:bigdata-wikipedia-2020-04-30-3 + entrypoint: [""] + allow_failure: true + before_script: + - mkdir -p /shared/submission/ + - cp submission.jar /shared/submission/submission.jar + script: + - cd /grader + - /grader/grade | /grader/feedback-printer diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..a35362b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,8 @@ +{ + "dotty": { + "trace": { + "remoteTracingUrl": "wss://lamppc36.epfl.ch/dotty-remote-tracer/upload/lsp.log", + "server": { "format": "JSON", "verbosity": "verbose" } + } + } +} diff --git a/assignment.sbt b/assignment.sbt new file mode 100644 index 0000000..4dd1761 --- /dev/null +++ b/assignment.sbt @@ -0,0 +1,9 @@ +// Student tasks (i.e. 
submit, packageSubmission) +enablePlugins(StudentTasks) + +courseraId := ch.epfl.lamp.CourseraId( + key = "EH8wby4kEeawURILfHIqjw", + itemId = "CfQX2", + premiumItemId = Some("QcWcs"), + partId = "5komc" +) diff --git a/build.sbt b/build.sbt new file mode 100644 index 0000000..d35eb5a --- /dev/null +++ b/build.sbt @@ -0,0 +1,20 @@ +course := "bigdata" +assignment := "wikipedia" + +scalaVersion := "0.24.0-RC1" +scalacOptions ++= Seq("-language:implicitConversions", "-deprecation") +libraryDependencies ++= Seq( + "com.novocode" % "junit-interface" % "0.11" % Test, + ("org.apache.spark" %% "spark-core" % "3.0.0-X1").withDottyCompat(scalaVersion.value), +) + +// Contains Spark 3 snapshot built against 2.13: https://github.com/smarter/spark/tree/scala-2.13 +resolvers += Resolver.bintrayRepo("smarter", "maven") + +testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-a", "-v", "-s") + +testSuite := "wikipedia.WikipediaSuite" + +// Without forking, ctrl-c doesn't actually fully stop Spark +fork in run := true +fork in Test := true diff --git a/grading-tests.jar b/grading-tests.jar new file mode 100644 index 0000000000000000000000000000000000000000..0ab663e974b78775f3428b084e5777e66d40f34e GIT binary patch literal 16739 zcmbWfWo#uew=EcEW@g3?b(lFP%pK+qGh>IDnVECK%*@Qp%-G>%`g>2Kd#`5R{Fq%O zm8)cLdr4KcReP@Xx2x>d9CDfBUKTGg^pnu}0Z`2|@cQn}(*)4XkIXSD;=vaxzn-`K9$v^&n?_xkB| zAGiFsxvO_Ao9!ew{rm3o`-AOwN~s{%3k>QM*7q>GuXZi{Y9VC6*SSl%GJp(cC2LSt zJo8igbBWZiwa{~^WJ*hWQ<_`0kd3xPEf) zB}WQuXTgb_kf<9hNyD;3f{+pA_5A%q&OAztx8jrkt63h%vwRTiswN5Ue}4StsTey= zP$1r9>NtrU5_qSQ?l?Za;#n3Z(MFrvDLaBN=~132!N#4bzEoqOqdd-TfkF$g;%848 zK3c%RyUF)nOkDNS+5 z$X-d|M4H7bd^3;8q*`#0HEsxR1Zs^+@ABJ8J3UmGkqnklO9#cs7bV`GLC^)(<+G~6 zKP4e93RTnv&>gL+%}b5PkX+omna^&rgA}tgD3_ai3aFt+rO^W?RJGIvFd$4iez9*v zyE^s8$rH*29rW1(+r_szsQZDfePz@DdPI?YO)kpZVjtaK-4po@p}*gIrs^tZrAUUx##VDgxXM(Z>l_c)#QC=SZ;H!;C-YjZ$Pk@I)Z*j85ITv8GOQa4&k6RW|v{-XN z3rS-=A{^L&x6M#A+dHz;G)6hw@A)Crx34H1b|JnXQ+gD$E@ 
zyiZ=wl{ZlqDF;v$=>rf2OfBZ|Quwn2CdV@eGc~V{h%jm`(k8CM*da?Buo*{WU6EYp z!cYMJAxY`smt!sC8r+*`AEDN_RGG*nRLQ>cZ(#P^=+AOAYF!(iwevEp*>%!h4hw3w zbBvAL@lva^1$)-p&7vWrFp{uq)1x(6&(j9y>`d4BCm|i=iv9c_3pn_x@hlfm?Q_vN zg0V`NHUNUXa*hbtaZ%1O3qxuj(653%31d7PlRvuhVAunhw{V^X>&tn~a>%Yu(BhdH z*Lc>4#yrNpx^m6zrATXAAIt6QD$u73sPp}Vck-syP{>#L+FMC^8-Lo}#i;LR^~zTu zwm?6gZVZ*kaNtJIE-G!?0dhsZI!}WtJt<^vqSnfy^@xvrMGXe>-@04x4bl^vR6tTOz z((?j6Pf7R>Lr_MC2w=9fxPHtA3H%Y58zOg>s(Ad-UUn~+`4hM)zLSJm3uY`~b~}KG zyOy;bn3w}$!uW?inE4ew-~CLMiupr3a#;IgYds;3DmiGk!Ni;oNm{WxMfyE*mz$4{ zTb?M6R&>Q7kQy=1AMZObC$ZK&=WMmlBKwh0R-|~V&z`Krk9BF|b{nK6#YR1NZ(WoH zL^d$=yXXdM7XFcJU-?IKQ3v(*MgGm4TK#p80euVe)5!+Ao&7hWH^|GH8I-5{g{aON zlDwa0f{NWQS^`}%J?F#@hznSA__rt4QpqWA1M&7=WI7QDc4sI~R-3lnbr0n`vv$RB z+YHY0HxZ}p1XZrJ8Qe^AxCnO9&SMreUMjsiMMIxq-FH%rE7CoZX6ML5j4raNzdq;8 z;R2?m4AKP&U@Yq9hoy?>=Zpuog%Gv9yc)GO8YwILjQnU8$x7Xjy8wRn3%rjbBG8}V zXIb_BD*lB^McXD8WL0>w8owrTyJok%(o|d)7@wd~73n7G%_rWEX6eaC)gQ&Dp`8ci zYRDkwS}(0RYoRRs6Cp)jGLjL2tZcY{%9aP(vQlTbFLA)6+4wJ&L3Texs5`oFaM4(Vk9B|^!qCZW$r#zsvgg30B0ezp5ToPXRk zPcNDrG-aE7jw|)qc@Dz1Hv*a|0<=HVf(~5vdMjv*dc56tW`)_v$8rT}q*UVFxs?8% zwkP8*u{~C?&#+kTBZctsg>MU;nle~ZBjSl?Uy8ofk*^N0Tb=&N%np5U@a`liYur(_ zJMby*9inE<{!t6c3btBSKl!Bk3Wc2N(3aFxFx!_bt`jqjLd@21<2n#sky=V%8R9IE zb1Xh!X&|zJq{`~x@GZ!+E@W4Fx!4t{r^Rr};br88+j^RJ$2 zMw)=n@mPFwvlx4Ya7&%qRrGAU9#Eikgoiv_QHa?tQmo@q*^8nAd-0?b2ZhiYxw*VU z<&k>o=}Dr}pi_leygx&4)_Er$AKX#F&aX;Sj<8||jinITj~yHt1E-6P1(D2-nF z+P{M^qLe(^C<$%h8tvs7b?vzZ14_E=MAifIku$YBm#jIdJZ+|b7^^XYC*j@Y1X}9H zhXU^6j`!K(dB^x6cO3mv1@WFcOgeQ;67Scil)#ZoP`py`jeV?d#hVxX6K4!72#1vg zyg&D^i>VPUrKnpb#5ReCRBOg49eGfn?WtLLuXhbRzZTbz zgx=6Pmv%rc+8NM)f0vHKY`tp21}!aa`622H2#Hb}$Kqc#kD6L!aF=k1T2>q8$=i@* zuSY!DM;$0HLXcUcaI&dw7@G8M*NPq4bn1Fa-h3D1fJWw&j)Xm+k~7^Z=OEN^8)*Al zhTR`PCyb5d8t`cU=8mW(g$(n7fwg{#vWqmi4c`kSz5>CWyI0mlPZdT_^%^7P=ws9I z#!RqpqCSNm_9j%CB6hoz=gTzoMsuKB^oI zR7_$=j+O!nl=+#u;Yl?^!@WyDAb@XVhWec<4~+U7hx;9%wZE9Y@W9$(%j_wxFQmnq z6l<81i(W;Ux{+U4gMVv1r|wzB{dsZe)(5i^oZQ5c9+jmjt@3UQCEkVJ@`fcICI3WT 
zR$%H8Kq$Y-4KO=<9yCR@sEkf9<$_AY2Iot-f*1 zzke|w0j_wJb6yEHZJQ%|rc}4#G?GX+S1=I33Z2;LSNk zhIYhLvN8~R>lG4}@~H8mEV}=ApWvL$6|i9U1QjP{6Y+iMdmerXES&f7iOwGSW|xzf zMEhMau_e8U>Xr)zDEnPfwy2&p4!+XNq}I`+tr4Lbh3G7Ptd}W0)^C@$ zAyi3TL^(9Nli|fY;SX?K7{=jHv&?Tz0T3}}jcl~I3} zcFpFo!yWNVTp8K6lCPYR(Yi;;t*@M%yz#EwS4r$XOXv8~C?l*s%#QPl80ky2C>nx(`<@q%nDPGyR` zQP$Qg!iDQiMNXPZk^v|d{Pir~K2Cd+kSq%NceypV3qj?E8LRWn!#s_8XhQqZwCtjk zfs-jz$dl8tJo^IuXF!S%>lZR{ciQ&b3JUvxslvRW5|n0=*BHIwoL>SS$>(?I3i!gt zH(7$2rgw$yokgvCQ))!<(8H8*&Asequc6zQLB}bp&%{|AGyll?r6MD<{xxdD5CNwy z&GC4k7tjR=xYcd$6c$LQmXBAkRNg@2Cc+aYf9Naalk8}%4lW<8UO7TUJi_NQ$}dZt z!ZvJ55$LnRY}do~gbmU=FP>9AWl5JLn*w+9DG-(?pQxK|@tC^Ch?a#^I)EWPM)ENF z9ky~;UOgmJH!imlv2=VVH<6TdUb9G>5l?FRdzH$6GX)#?S|DC!NSAb?+N_C;vX;F= zaK41(u1!7dTsZMGx{%jXOzH={;Le*YdSJzGRLBLe{4ka+Y<>;ATcam;XluKQr6;n+ zXVO5NK|L7{@huU2eCQU|WopXItFq#={2qCfFuP->^9TVzIj1h}CS2*^4R8>#Bz`kg z1`r*_&~4oq9T#RMeTO#=U%sZU2tzsc^Ei?T$;YM+lcoLkShW9r??;Bo68qvuX|hXE zeLw-4W&JeI#c>8bhtx^^LF$1g_N7CX5H_)slcbFlA%;%MsBXhoHN_`3pQU(GKY}Q9 zg>iC%qTh@^QC<)A^g!$oudPEOFgi@ zY}Ne;%c^e~**ScImOHr4O%$JLL*#(lqbtN5Hc?juP?<0m55|QaF`1ezgfT6kKR)lglUE4*?May+vP`<-8*PKyMr3MttNiEV=Ckke_&oNeG z=LWKp+t{!@F(afl=pykjPWeV~wmJ&v)$G0U$Q;^?2&90ax9NOrTYG z0<_mwu@wFObRA6HVBUX!0NLGmz1w5^ufJ;XR)mTdp)3Oo2ef@m%ULSC%dx|cXlZr# ze-_OZ{|Lrf^6Sqh&SEs>Svxkj21L;1C9avy9iwJHGAmM8_;|vUwxXYIVuTMG-<1)I z=st6D1zRm#?MgZh=^Eu~d#i0mpE@JgpDOyX@a^k4gr*L|zfQMOuA=@OAj0!DaG=N= z+o^*XSRWUArea*DTX6TFV+R|mdMf0~%PuxC@quPRWCmYD~_Y^?uU%B-3>al3(%9tMY1s)h~C3FP!1uhu; z@fG&t5lw}siWU8o^wR|5EyZ*Zs+ow(cuS{M1x;qbmd>tomJMY3>RH-{Z1kJ;Mv27* zLh29^!wzvlE*&DEpgFtf$Ury}C(jhvMHmEw!+6O2M>>vEeB^YFH_^D~YM8d%vI zn#%-4S@0uDexDvq7K|e_HDalmSRPdsjHSzPcbull7FRLhE$P#~ekLdE=+jQi7F)ZT zo)>MUmnXChbdsk@j{(;#thFBWr(%R^R-9gow}|Id6h?A=RKUs zx2JT=beu7@$Nrt;JhSx_{;KuwmbjViy3BibCwRJJUmsvQHO-L}Jj8lKZJCSZmJr+n zoR)eMen#09=RWwpK?=(E9HyRP`bO&=3C>yd%k`k4pSmas+l2)`)@#c4U`g8_&#@@* zp7L$B^@Z*s@A67sg$s>GcJ;r9Q|SJ6X%~?hmSGLGF84l`X&*Ol0QaT5&NtG`Osofgf1jE 
zrcf#3pbn=m_&xIsP*EBVb;B?N#Pe1~zbI7QR2bQ`!V~@NA**Fy8*kZ}*^NNA_QJX- zGp7JG(xR{_YU{nzmP$ud(6KBi^!|1#K)X0daV6=KfOJT8wnyPJ@J=I;@R&v*pE@C3 zP`f3qP+qqs!!jb8rp(c;KspYpY}duHO9l>@@;nf3;Eo&#mz+*g7ZaD>tJ{DgkN z6{*QxqB%3$(60GGI4cdj4&>U=`xP9Uu~|;p27O8?9C{X<(V4JWejHs$aRcI$Yf;@W zn2)W~z8Qkn$t=1WQ4vA$Nh2*7KmRSOu3AE$2YX5--BUG2e9##_{4{rgN$ElP3Vr5N zK=D9YAnJ3e!#SMndD<)b*MzTgX7MOJ$t#E{l?;?tG7p=x@uDc`?K$8z_U?tYfIN*n zM7n~#HS2IjqJ;MFaL$>;jw*WO!yar(_(~`35JO9-Nj3Hcs=|L|c^+%gdT1%~LaQRs zZe3i5P zIcSe|)!R$m!{BLT$19L?B87bzH9VdVp^BN=ydiOXd!Ud7rCbRW57MN3Ar*Y{@ARk{ zR@_`u(=6&gcdsy9JVK=f(A<6czv2%UY*}%tW(-giQiygXN=oRsPi`$Ix!aJP*h&cW zG$t=1J&Y4()brCekQ$&Mt*;NHSU9P9p0VGA02mHk`&u6mK`um45!QK#aG||mE<@8TMh|Y50U}0MJP5tA*UI~E$P5Q;w zke~BfnqGvI_BN8A)E3$0{n6~nf=p~UictrV{s`sE^e#;i&@MA4kmn;tx$H~sY@YGI z<{s2J^wLSwxb;u?Wfg!kcThjXy)qZrGz-g5l2sTROLA(Mj;)GvSn4J=6fhrtD_cr> zEtmXN@k+d`sXqttV+yr&1UM#sybd&yjOpJ(SrT02lFYQDjU2E$GGnH2C3sIT4KO)9 z3?FWZrIqR{+-o2e1Lwr1zm~ZI&U!8WsGbo)CVU|#==_dpoC{+B1AxrRHG-~4uy2^B znUl*pW9fVvJ9A{q<(4{~D)FS|6gee_n+-%;oq28-*7|TP(Hq0gW=C19f-?xp5Sjjb z_1yE&ZrC!*xRM}G_gS`VuJPq+NQa0)(7u{zT;f0fsj#)JJ?VqBpj0r%-o>t(+^7~p zt3jtC&)+MZ##%$cJO(ZKAiA=X#F}m#H85?$M#H>e(h^TjnC?gfcDHx@3L+?|M2lVC+779K5nUMaR9NeafE7LP z23uQ;xx53_@wdN#*I?q|6r_Gte-`qQzFmma09R{e-WMHITNWbAkSuRos26m=gYZ;% zB*1yR#k%fR6%HO{Ah>5xj~?r5fC#=CfuU;V!FCvIv;(#>$H;dZ3BMXS60xfE@X9UX zRd5NYvem6Qxx>qI_(5`XPFu^q!sSxSg;>i4G{7Z+Xk7I_8aX_V1;Z~|oz*m`$1o## z_WgjJd&EP?%hA9E0WV#c($NEnd7)Dv6?if z3n1dmbL50XDD$2OyXH2}2d6;3nl$Ls)Be+G4Han&%9y@!SqpYpDti2r`^V>PLVDJv z$#F7r;v~CwtI4rbtL>j=yC01{;Eg`y97fk0VjU=HZo&tzwTDb>=@#sX>2fJ1W}(1s zJwiU}k0;4EbSNm*18rhM`4@2=DINPW%T8R&548ChZImZhQJ;?JEc@rV!=U@yf;y_& z>d=-aedL=lJ!JgXSHHr_ol5X;wc{s2}do~p^5iDVm^rcioF9SyXpe*4mqc*f8Q&WVAnde`v zpn1-gKa&LV7BCHdP%_1t-)qe#rHUkar-3c%3`!S-I`D%<^Bb98qMBc#DkEAcXI2@vd#h)e?;je1Q`1*v z012ASmZ}O_laP_qsO|Y-?{e7({m}LWA?I==s&AuU)7seAQJSW~(lDEi)DYqPSd7zd zzx(ETiD4o5p)GRQiae0!`H)-v@P)pomHPaszSsVIXG{EjQzq8cnUmC@7#f3D%F0T_ zXo@U}!g`3G$5!2BmKSa0Vy7f_Yr#oMbP{fG$G&PjH%#}DO5rvm_8DHBpgDPvuG_CQ 
zmx*Ar%XgiUxBRK8!z8(c+fqK-Eft^WNdXr_ z(>384Dd>w0^+qJ}>1M&+4RBD~t?2VX?1Fjel6v|vVCmg54@eX5`5ivQ{L8p5j+q|L z2Hex``_I?C0jO?cdRRB)q;!rIFy)e9QfbHxa}57TA{Pb`ete_?;;p51TkK2@eB2If zyl0@|*2W3o76Pg_<_u(X?~ofNX4i2jPrf@kwC@-25TBAH28xU^|KCcOVG5#;ndjlQQ#w4~1U{Avs`8>pnp9D(|dg4I}=L%ffPH zGvZN^3Zhb06N&?3jNI%@-X{khvkHeqPAWIqZH=fR6J`%0w3V6~Z=p?l^W5;vSI5{c zb=z@CeXnN7QmwIIiXU6VMUORXV~JY17n9!qtg#{r>BClZR=0u19IG|0%2 ztZ?I{q2l4DRUdnYKWOS85_{YyOFkU^#&z?&Z8Uw1_{scgQ{~!91&?eMyV;NJML`fG)P-Wrmc0pAuy`~;iFaol*8CR<@YhXQ2g zrnZr&9VfQJS$(R0^1)9T0bWG_uW*1@dBCd@;8i>2UV-6AUu)b>bG)Y^={t=HUL!++ zg>0%c%_zUTHmOv5{NG8Ka*qY@$YhADnREAygwG52=uBfiq<@k3P3jpE+|$3u|KR;9 zv=2u$iYP-_7&fedoI zMcu9^8P2O0?3eN*j02oIMYKd=sm)lu2|j}$NMJN05fr*q?;r$th|nJc5qAz0qCLXi zS48}ob5-5q9?s1e)J$V zr&C3b!)$+JFZ@_5VrA_uEm!>T+BKuwQ3ESgw=|o@O2vN(Uod0El~~`uqhpxn+x?Pt z-9W&VX+o13B}!HXdgg{#W)>|@DqEQS`Gf?jPtkD1<+7U^Zi9~`F?6{$J7#2n@{;8i zX9*KM(nTu)^BKFrf}2_?kY8FJWvSQ*v)FJB@T2;K)bA3bTM_0w#xYVL4M)-4|Fsyg7{Hc$tr zo-*%^$m~t{iG+%^|D8%(!U8b-g$mkNmV8UgP}y(bD`BWhI1MgYqW5p~ z8S$p7n)m~#w={BW^|%3AI+mX#1W?muSwnx=L@U+F_vje&_eAZ>>Ko)F`q6D(&B1Nf zGz)91?fTgyM{|7suI+&{L{9Fi;2}h9bNx!6>MSw~(*qwC4Vf!a4eaeK6?KuNT9v~q z7C5=mr(s}x+D&TitWqt%OS81^w4M(O6$l+rhitLoZ35R}wu~Mo7=s*`%ne{njOsHL zT5RV`dc&oQ?!U*?vH-a-IVOZ?QRuruRLKpLD_Lp*pjuu$-7V2yA@qhT(@B}Wu z{7gIPJ%%>D`<_Zq7#cMz+cssdXTmd~3uMqX{gFqOE}S_vYJ@eGR#x}T1df$u;VCgf zqx9tk*+a7_jhYapwa*=VL@Y-V=1I*a5r9dK^EpUT6itn>T%+I`Kwpw9%OjV;Ws?$5 z-uEeqfVm$So`J9*+5mT*nYhSZE9oGdFY$izlG2KNNjCsk<1voucPuA@V_)YiTDtQ< zjz}7gVhE>%ab{sTDK2ATXH??Tb3X4mDC}P+IO{~dXN~hJmNAYiyQrs+??j`;(eoTB z^3({gvnHq+%PN~OnZ4^(U^!cy#Mw=Mj*}ME$cvJ2+g7&cyeQu=@X;?;S-t-0LPm`< zOxM&8$DsU0Ukqgx2m5f`r5`<wO)9ofCv;BHh)ZG3Ii-Ej~{qDeH)eRYyS)*+E zZxa}alKD|qTv4dPF^aunN(uVQOu!G%GAeUlG|cozPbhyJDtSLbtNlb)(Uj$)>7=RU z^WS47R!{Y%2vTL1?R3Vi*7K3f<7SmnzgU4G;e?PI3C?15AK3WD&@4B3Bc>8S>&E{5 zno#w%(fE2R%JYDQ!^xu zKWe$@u5ph#RI9GnQ>>%7bQA!x} zMH~s0BsTkII4rs>MI_AZ#S>J`qS_{(7=?eYBz@Jp0jv#&D2}OOpxTU?Dy~x3yP7p` zy+(3Fl4qggAI$No(@xJQ`^{_tN9?x00%1Hwj6OvO%J|KjvAVkfhrd9RXGq^o-lQSi_-z 
z+oU+_pA;Pe*FO)eZTK27QfxN*B;HCvZLHC*Y?U>w4Cq&>B6Sz|@H+R=%dOcSH5+>Q zp!&(cY0_WTVb<6am9X>^gz?IA(yT#qdixrk2MrBx{UidKMA&(e8SG+*FE>IzDI%53 z3hV`shbpKW>e@b(63059ix>h4!VL?Oa8=jRHgT9~6l%o+pJ2){k`itZ6~CbwR&I__ zeyq~fLGNN!g-H>IEW@NLCx#ydPj(kkhYWzh^2SH1pUL4N*szG{XTL$E9Vue9#F*qy z7yVx3u0NZbFHl7{1{$FI`=eBL?}lmTqwEqZ3Qt05hb$Md6y%T0>1(%tJpB$_Jrqy$ z@?^60OoeZFl;wLyIP^yP=sHzi-NoSM*vpfaEujtcAU)xrI`Ngyh*={RgmDkos_iLY zpEO#u+$|)?P%)&X)fs%R|5YIyC=1LO8ZDSGS<|XA7-lGzFu|Fe5fx|nGg^mmhpZQc zfUw-+hTEh*aw=YFt+f>S^mkoy>25Oi_=7+}dfZB%# zVKpiCzdr}y1n#_v%i;H~Njm`>X&cP{s2dJwi(3k~rc#9z6nzypbZK{#(#3b&M!3lM z-R~q*i4y5J)T|=aus88g{W(f0<&-M>0xvWc?=b0>p5D-ofMt0}j&yR*5d&zu5G)Q8 zoI;n>MTVQLPCk3I8`)}0_E1mNCyg7odMPex2}yJ6ezFqTPW>@21VLyd-g5m?Q8cT875O%kt5 zu+_j~qcI;O`xYoQO>6^yK@|1z!+mYCaQ2y-+@Bb`8#;5Vct%N zz|I1Z&ha;k&_G$sP7c3}w|LB#YGujUU9z?+tu&fmSs`2y?7H+@SpbbEYT$q-^^V}SEqC|mu~w(z54DL*Il$pmr<+AFK-;(?o_LAL zmuTE0U%g|pQMUWR!6v<2)_18v#{N9zLcK&pm(H7F)fv(8MTceqh$_%Uxy)-NvElxo}JHM*FA)A+H+8~H@cM2IYDtQRX!7cG5MR5QrakW0OjBfHDdn$x|lRoP76J? z4L?`MS1g+2xOCZRPM5@a!Bkq}ZS3kze{(RWn^L-%Hlr)7W8M#I)UU*zugNMb@5a@Y zfT*!hQ%oFRdOKax>j^=fg*akn#EU46VB~H?3aO<94l=>kXWzY0mzv*$=A?75zSSF65~|& zl;b>{T4!A>)ujrJeceqN7qJ*7zp=W>n8g}BQ`od0Kpx4m?*;clp=Cd7hO5U@cYfKQduHRDeAZHV&uI}N)qHt10K)7??DKi-_~6v;W?6z6GORjq zAy~}t=B__)%rE3K6A}>~;-cWnhkZANR*0o{>O+R`Du1#>u(Z7a868CgknV5=kkoiK z=gG28yGux#CG1aobJuGqn|9U=1yLdT8#oBz&~ZetlmH|$O1h`aKnGun6*nY>#_XvBb z)yrO=NR{`UlxJkPiP;848zD~Op?oLdUWFn(!bLp$fiGvx6eJhrm`CJ<=y;aT$}zOL zv}7nvF02ZBzVOo>yT77=;l%Lp&a*V?gh4+i2|}7qA*yg{;e#t4GU{Ep{G=uYK8GEFERDd8B%L%M0~N+1MZ*jI$(c}Wimc0tuiP{)I4B!_~dxU z*M`DFY=h*Eo(()cn_6;Pm!s6el{6M;2g``aM1me%(I92}PDc7#Eg15GrIekvEPF%C z9DY7*UiAwGDNu?KuZz0>5`|*C2i<_l27nuqiI!L%eYTW{TJc<-;$6vxDs-su?(+P~ zk%Xc@`kX=6JNo3EgscU8k!(aS;H~QJr6Ox;2&37QmLu@?$}6|P_IVjiC_}RsA#qtS zCA`cm^yv|LR|hsPhIUI>oktuY|AYxvn*Mh&CJz!&ckc3cJ8mqtmT?1Ps^a6a)K)4s7lOr``b_ah&0A6l)>C1-DaDQRSj zdfXGiCODhRGit?@l=d7qdg5YF<}&NgKEdzN1EDYJVJElib9?2-dIP)-_BL1$jK1%) zf;|$njiVvH{6-pA_*=vWA$b&tK 
znyF-<*z3jg)OJ6Wbb_)60r-j3qa*(}a#Y~$NG9v^+l$l1+<_Sh$CCiq9SY7h8B@6r z#Ho%Bb?uDnl>@!FAp~!X4Zb+bLb}G=&G?8X#5xU{Hm~Gc1*&}&@f{;%d>R7p5nN^T zwtGMjU5T-p;VLNueD{U@YyUj*TzVpE$v%D&$}99ne2aNp#f{=O2Uf|F)QFPB{(Ppm z(Yvx=J!k)$(`Q-8s00<`V!T^N=>SzF808%!<%rHi5yVWRH+a`dlVZd`#J`Mp*%`N$}UX zT%jxcSI&HA<2?mw)|FfGfjKJx&A(-e~W zt2+qT&svCIWE_PNL1syR=;s0?+BNB@0_9{zY#H)f4)A%fpjJ*Fb@>;KI``BFQY93>;}~EB#!0EGYX-5kvmmUq`;1 z{mk6d7W@*w;E%^zh@>o9LGu^r`$oxo)$H`uTk*osJ!6dU^?1@df|NNuFN3aW?Yk17 zYJ{^YE_kVwVbMu7Qb9p*&iQHWcWrwIwH7K_}AAR4;;k|Q)P z=_$|{C*~!eh3-i1AO!VXa7*e)y5EMwcPVQ@ldzFzg1>G&8WYjbvZsR>rMod3R}b0+dk#>oDSR`@`h8>9EyRp?ZQ z5xKWbpAl1?LUf(Z7PL;G76O) zspv?_L~9tyw!>S~u60pb`#6+MTczeX%!<;H0VfpQ)z<{W5}7;j%gXvk z>*7g|)sw!_iLQ>~vS{@NW_gR>((+DhxWAs9R=IgM~oa)I=0Npz#;1&rY_$qIxas>AW%bq z0ZF!Xcq|7~4{Y*fuuvNN zAoz&voAY2u-FJxS}t17T8?bK|83WZMiI68pXqmT>-2rt&DgimUH@`I$eeC4=ZPT38 zpdzVj>qU#_Tx700<=e^5Knv_E>zcIvk;{k-blmM(5k0D$#JT0`xON@PVU`hb7?b$j z4ScM|xxFFUe`_)Ii|ZP|CCx&##q|y+kk+u1T04BJGo2=iO)qw}kIU_fTo=BQFg_(` z=!(zP7+wIpwqcv;W;KSPNXsCn&Z=gGh7Hwp5`VKWR1WX0}$J z3zmk}w}DNy^f$+gwzuk6%_-j6=S?9GWXn&v4S=$r)l@xyG4N?oR%^*~r;Z*bQ40Rr zZ7=K=|7F(i{C73v@ts*v^-XYpRjM}GEhB;Q+ABSE3aa+jMDN&#o|GaA$6P7 zAM5YyuO}uq0c83YkJ1w24Bt>ntVo6&bVlFCk2$pXp|jM|9p{2AtIZVBzmgBWuH#Rm z?L8~fkL3Zx%5tOn#DegWuiU>WzF0Td1kd|+jI#xwT<%6*(!|mH1UbGNi}C#o!@CvH zd{SG7u04`fBQay_FHRUK={py%Ki03%m^a~mcIH@)*L4vfE;C|-_ zV7ISC;KTod}0fzhClBuF}ZIjeHrU8y_3z^yNJXm%Bm@ZKvRa=nu( zb@+AkqHpOL_inXQgY6Rko;v+~D}DIYe!BvFkZcQ|QO9U)bGUpXZ;-0`U5m~eJu!CN z4mm1Wan>RAwAo*qEHDDh)qVpnLt0Vhy|;1`ADLLa#j|wswb6F&jQf253O*Ll?&+ad zCp!uqSgsJ+k8p|?l3v8*C_8pXUiZ12{OOkTw|t`H`cnG+c59#|lBH{>hswO?k}kPA zty^RAfaMhWGxH^RoEL+E`csf0CK(Y(7n4#}UzS8d1u2zNzY$ecGEkP9~9jo+^;AP;6CIY#e<#T8^x zEr=Z|q}ioVXgBF226p7*W&`+{_T{2_Ik+4eRoxh&`QdAP=|7^U#a$exRWujeOfma+ zyFOdvP=s}#NSlCR7QVytl?`vwm>P*00MQJO5eM?GNM>2uNS#kcCwTzq-8Iws&squ+ z5r)hClk7OK{iW<0tiLc}C#ifr-Si}S{14|g;U_0nIXxKgVI(|tgqbCYx*p{F)zsKQ zwoh(Hz_6{c?v~0J9@Tw8xyK}PTpsTlZpz`OXAJ-=sTNaSrh7>BgzpsN&%VKEwd6Qm^DXO@W1<)NxB_8rLaqy 
zl|pB47f`2fXuHUhw(H}MUOlS5$^v?nRU_(QdDEL*{?4@ zQRd?oiFYQPfO3wja|Yu^E3rt^7P0#mAiS-k6EdYZ(ry&qZ4sDlJw4mtq{-dsGIkgJ zX_c$#m0VQ$Me{9+2U}ZYbOeZH$O-LI?0Gx{(zkCe5QaU4+njdHZG%g7KI9+OGzAV1 zP75*5VvqgUi|{9CDAC%Pyww~Lr@_qSu!ojSrnW>)8eysJ$CTR@iFL{vm-=p=(&aN( zJ>j4v^P0WOxvfHUI?~cKi_}s!g?0{4`EAs7{Hbwd&fP0yiw!h`5ook@CYx+y9W&Ii z9Txy1N&TzA0?A>BnPPL)`dVR9S0Y+g6M_we2o&6r2(-S1jkoIu%bFoRF%t2z?WLRJ ztU~3}Nn3ExO^QZvqtmPio^#F-C%=VaTyFBT@7T$x9v$Tm=wb)sjME zdmDYxO1o^OTM8t3JtOlUC*!;cxOlORL(}QS;|Yy9rk$AOX1EfP!{yg#duSM=~OJ?-UYd zie_x&1(r{f2*m;_Q#c3^r7tIYGvj(#Jqc(%^eU*4auV6Fb%3|<^VaQCYR(14a`o7N zvgOlf?RljH^~RUA>nfRC(a1#!V%YI(t9|HrwXLdfAe#*Q$VAEEX_{DdM>j#DR57)p zYLDDE?0@w2xrj2MCBz^g!-D^b_=AFBfc#hDfd8(&|4-t8|7riHGy?ze0tfPc*6IJ3 zmz96T|E}8quipQo+yC!eg#YOF|I5Dr75}T<|C@^cza#mNum6tnfBF(9`Ty|szqkyF W07$6+^bPUv!1|YkfK%u{cmEfx0Wqoo literal 0 HcmV?d00001 diff --git a/project/MOOCSettings.scala b/project/MOOCSettings.scala new file mode 100644 index 0000000..171244f --- /dev/null +++ b/project/MOOCSettings.scala @@ -0,0 +1,46 @@ +package ch.epfl.lamp + +import sbt._ +import sbt.Keys._ + +/** + * Coursera uses two versions of each assignment. They both have the same assignment key and part id but have + * different item ids. + * + * @param key Assignment key + * @param partId Assignment partId + * @param itemId Item id of the non premium version + * @param premiumItemId Item id of the premium version (`None` if the assignment is optional) + */ +case class CourseraId(key: String, partId: String, itemId: String, premiumItemId: Option[String]) + +/** + * Settings shared by all assignments, reused in various tasks. 
+ */ +object MOOCSettings extends AutoPlugin { + + object autoImport { + val course = SettingKey[String]("course") + val assignment = SettingKey[String]("assignment") + val options = SettingKey[Map[String, Map[String, String]]]("options") + val courseraId = settingKey[CourseraId]("Coursera-specific information identifying the assignment") + val testSuite = settingKey[String]("Fully qualified name of the test suite of this assignment") + // Convenient alias + type CourseraId = ch.epfl.lamp.CourseraId + val CourseraId = ch.epfl.lamp.CourseraId + } + + import autoImport._ + + override val globalSettings: Seq[Def.Setting[_]] = Seq( + // supershell is verbose, buggy and useless. + useSuperShell := false + ) + + override val projectSettings: Seq[Def.Setting[_]] = Seq( + parallelExecution in Test := false, + // Report test result after each test instead of waiting for every test to finish + logBuffered in Test := false, + name := s"${course.value}-${assignment.value}" + ) +} diff --git a/project/StudentTasks.scala b/project/StudentTasks.scala new file mode 100644 index 0000000..7604830 --- /dev/null +++ b/project/StudentTasks.scala @@ -0,0 +1,318 @@ +package ch.epfl.lamp + +import sbt._ +import Keys._ + +// import scalaj.http._ +import java.io.{File, FileInputStream, IOException} +import org.apache.commons.codec.binary.Base64 +// import play.api.libs.json.{Json, JsObject, JsPath} +import scala.util.{Failure, Success, Try} + +/** + * Provides tasks for submitting the assignment + */ +object StudentTasks extends AutoPlugin { + + override def requires = super.requires && MOOCSettings + + object autoImport { + val packageSourcesOnly = TaskKey[File]("packageSourcesOnly", "Package the sources of the project") + val packageBinWithoutResources = TaskKey[File]("packageBinWithoutResources", "Like packageBin, but without the resources") + val packageSubmissionZip = TaskKey[File]("packageSubmissionZip") + val packageSubmission = inputKey[Unit]("package solution as an archive file") + 
val runGradingTests = taskKey[Unit]("run black-box tests used for final grading") + } + + + import autoImport._ + import MOOCSettings.autoImport._ + + override lazy val projectSettings = Seq( + packageSubmissionSetting, + // submitSetting, + runGradingTestsSettings, + + fork := true, + connectInput in run := true, + outputStrategy := Some(StdoutOutput), + ) ++ packageSubmissionZipSettings + + lazy val runGradingTestsSettings = runGradingTests := { + val testSuiteJar = "grading-tests.jar" + if (!new File(testSuiteJar).exists) { + throw new MessageOnlyException(s"Could not find tests JarFile: $testSuiteJar") + } + + val classPath = s"${(Test / dependencyClasspath).value.map(_.data).mkString(File.pathSeparator)}${File.pathSeparator}$testSuiteJar" + val junitProcess = + Fork.java.fork( + ForkOptions(), + "-cp" :: classPath :: + "org.junit.runner.JUnitCore" :: + (Test / testSuite).value :: + Nil + ) + + // Wait for tests to complete. + junitProcess.exitValue() + } + + + /** ********************************************************** + * SUBMITTING A SOLUTION TO COURSERA + */ + + val packageSubmissionZipSettings = Seq( + packageSubmissionZip := { + val submission = crossTarget.value / "submission.zip" + val sources = (packageSourcesOnly in Compile).value + val binaries = (packageBinWithoutResources in Compile).value + IO.zip(Seq(sources -> "sources.zip", binaries -> "binaries.jar"), submission) + submission + }, + artifactClassifier in packageSourcesOnly := Some("sources"), + artifact in (Compile, packageBinWithoutResources) ~= (art => art.withName(art.name + "-without-resources")) + ) ++ + inConfig(Compile)( + Defaults.packageTaskSettings(packageSourcesOnly, Defaults.sourceMappings) ++ + Defaults.packageTaskSettings(packageBinWithoutResources, Def.task { + val relativePaths = + (unmanagedResources in Compile).value.flatMap(Path.relativeTo((unmanagedResourceDirectories in Compile).value)(_)) + (mappings in (Compile, packageBin)).value.filterNot { case (_, path) => 
relativePaths.contains(path) } + }) + ) + + val maxSubmitFileSize = { + val mb = 1024 * 1024 + 10 * mb + } + + /** Check that the jar exists, isn't empty, isn't crazy big, and can be read + * If so, encode jar as base64 so we can send it to Coursera + */ + def prepareJar(jar: File, s: TaskStreams): String = { + val errPrefix = "Error submitting assignment jar: " + val fileLength = jar.length() + if (!jar.exists()) { + s.log.error(errPrefix + "jar archive does not exist\n" + jar.getAbsolutePath) + failSubmit() + } else if (fileLength == 0L) { + s.log.error(errPrefix + "jar archive is empty\n" + jar.getAbsolutePath) + failSubmit() + } else if (fileLength > maxSubmitFileSize) { + s.log.error(errPrefix + "jar archive is too big. Allowed size: " + + maxSubmitFileSize + " bytes, found " + fileLength + " bytes.\n" + + jar.getAbsolutePath) + failSubmit() + } else { + val bytes = new Array[Byte](fileLength.toInt) + val sizeRead = try { + val is = new FileInputStream(jar) + val read = is.read(bytes) + is.close() + read + } catch { + case ex: IOException => + s.log.error(errPrefix + "failed to read sources jar archive\n" + ex.toString) + failSubmit() + } + if (sizeRead != bytes.length) { + s.log.error(errPrefix + "failed to read the sources jar archive, size read: " + sizeRead) + failSubmit() + } else encodeBase64(bytes) + } + } + + /** Task to package solution to a given file path */ + lazy val packageSubmissionSetting = packageSubmission := { + val args: Seq[String] = Def.spaceDelimited("[path]").parsed + val s: TaskStreams = streams.value // for logging + val jar = (packageSubmissionZip in Compile).value + + val base64Jar = prepareJar(jar, s) + + val path = args.headOption.getOrElse((baseDirectory.value / "submission.jar").absolutePath) + scala.tools.nsc.io.File(path).writeAll(base64Jar) + } + +/* + /** Task to submit a solution to coursera */ + val submit = inputKey[Unit]("submit solution to Coursera") + lazy val submitSetting = submit := { + // Fail if scalafix linting 
does not pass. + scalafixLinting.value + + val args: Seq[String] = Def.spaceDelimited("<arg>").parsed + val s: TaskStreams = streams.value // for logging + val jar = (packageSubmissionZip in Compile).value + + val assignmentDetails = + courseraId.?.value.getOrElse(throw new MessageOnlyException("This assignment can not be submitted to Coursera because the `courseraId` setting is undefined")) + val assignmentKey = assignmentDetails.key + val courseName = + course.value match { + case "capstone" => "scala-capstone" + case "bigdata" => "scala-spark-big-data" + case other => other + } + + val partId = assignmentDetails.partId + val itemId = assignmentDetails.itemId + val premiumItemId = assignmentDetails.premiumItemId + + val (email, secret) = args match { + case email :: secret :: Nil => + (email, secret) + case _ => + val inputErr = + s"""|Invalid input to `submit`. The required syntax for `submit` is: + |submit <email-address> <submit-token> + | + |The submit token is NOT YOUR LOGIN PASSWORD. + |It can be obtained from the assignment page: + |https://www.coursera.org/learn/$courseName/programming/$itemId + |${ + premiumItemId.fold("") { id => + s"""or (for premium learners): + |https://www.coursera.org/learn/$courseName/programming/$id + """.stripMargin + } + } + """.stripMargin + s.log.error(inputErr) + failSubmit() + } + + val base64Jar = prepareJar(jar, s) + val json = + s"""|{ + | "assignmentKey":"$assignmentKey", + | "submitterEmail":"$email", + | "secret":"$secret", + | "parts":{ + | "$partId":{ + | "output":"$base64Jar" + | } + | } + |}""".stripMargin + + def postSubmission[T](data: String): Try[HttpResponse[String]] = { + val http = Http("https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1") + val hs = List( + ("Cache-Control", "no-cache"), + ("Content-Type", "application/json") + ) + s.log.info("Connecting to Coursera...") + val response = Try(http.postData(data) + .headers(hs) + .option(HttpOptions.connTimeout(10000)) // scalaj default timeout is only 100ms, changing that to 
10s + .asString) // kick off HTTP POST + response + } + + val connectMsg = + s"""|Attempting to submit "${assignment.value}" assignment in "$courseName" course + |Using: + |- email: $email + |- submit token: $secret""".stripMargin + s.log.info(connectMsg) + + def reportCourseraResponse(response: HttpResponse[String]): Unit = { + val code = response.code + val respBody = response.body + + /* Sample JSON response from Coursera + { + "message": "Invalid email or token.", + "details": { + "learnerMessage": "Invalid email or token." + } + } + */ + + // Success, Coursera responds with 2xx HTTP status code + if (response.is2xx) { + val successfulSubmitMsg = + s"""|Successfully connected to Coursera. (Status $code) + | + |Assignment submitted successfully! + | + |You can see how you scored by going to: + |https://www.coursera.org/learn/$courseName/programming/$itemId/ + |${ + premiumItemId.fold("") { id => + s"""or (for premium learners): + |https://www.coursera.org/learn/$courseName/programming/$id + """.stripMargin + } + } + |and clicking on "My Submission".""".stripMargin + s.log.info(successfulSubmitMsg) + } + + // Failure, Coursera responds with 4xx HTTP status code (client-side failure) + else if (response.is4xx) { + val result = Try(Json.parse(respBody)).toOption + val learnerMsg = result match { + case Some(resp: JsObject) => + (JsPath \ "details" \ "learnerMessage").read[String].reads(resp).get + case Some(x) => // shouldn't happen + "Could not parse Coursera's response:\n" + x + case None => + "Could not parse Coursera's response:\n" + respBody + } + val failedSubmitMsg = + s"""|Submission failed. + |There was something wrong while attempting to submit. + |Coursera says: + |$learnerMsg (Status $code)""".stripMargin + s.log.error(failedSubmitMsg) + } + + // Failure, Coursera responds with 5xx HTTP status code (server-side failure) + else if (response.is5xx) { + val failedSubmitMsg = + s"""|Submission failed. 
+ |Coursera seems to be unavailable at the moment (Status $code) + |Check https://status.coursera.org/ and try again in a few minutes. + """.stripMargin + s.log.error(failedSubmitMsg) + } + + // Failure, Coursera responds with an unexpected status code + else { + val failedSubmitMsg = + s"""|Submission failed. + |Coursera replied with an unexpected code (Status $code) + """.stripMargin + s.log.error(failedSubmitMsg) + } + } + + // kick it all off, actually make request + postSubmission(json) match { + case Success(resp) => reportCourseraResponse(resp) + case Failure(e) => + val failedConnectMsg = + s"""|Connection to Coursera failed. + |There was something wrong while attempting to connect to Coursera. + |Check your internet connection. + |${e.toString}""".stripMargin + s.log.error(failedConnectMsg) + } + + } +*/ + + def failSubmit(): Nothing = { + sys.error("Submission failed") + } + + /** + * ***************** + * DEALING WITH JARS + */ + def encodeBase64(bytes: Array[Byte]): String = + new String(Base64.encodeBase64(bytes)) +} diff --git a/project/build.properties b/project/build.properties new file mode 100644 index 0000000..a919a9b --- /dev/null +++ b/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.3.8 diff --git a/project/buildSettings.sbt b/project/buildSettings.sbt new file mode 100644 index 0000000..8fac702 --- /dev/null +++ b/project/buildSettings.sbt @@ -0,0 +1,5 @@ +// Used for Coursera submission (StudentPlugin) +// libraryDependencies += "org.scalaj" %% "scalaj-http" % "2.4.2" +// libraryDependencies += "com.typesafe.play" %% "play-json" % "2.7.4" +// Used for Base64 (StudentPlugin) +libraryDependencies += "commons-codec" % "commons-codec" % "1.10" diff --git a/project/plugins.sbt b/project/plugins.sbt new file mode 100644 index 0000000..017735d --- /dev/null +++ b/project/plugins.sbt @@ -0,0 +1,2 @@ +addSbtPlugin("org.scala-js" % "sbt-scalajs" % "0.6.28") +addSbtPlugin("ch.epfl.lamp" % "sbt-dotty" % "0.4.0") diff --git 
a/src/main/resources/wikipedia/.keep b/src/main/resources/wikipedia/.keep new file mode 100644 index 0000000..e69de29 diff --git a/src/main/scala/wikipedia/WikipediaData.scala b/src/main/scala/wikipedia/WikipediaData.scala new file mode 100644 index 0000000..2bfc238 --- /dev/null +++ b/src/main/scala/wikipedia/WikipediaData.scala @@ -0,0 +1,21 @@ +package wikipedia + +import scala.io.Source + +object WikipediaData { + + private[wikipedia] def lines: List[String] = { + Option(getClass.getResourceAsStream("/wikipedia/wikipedia-grading.dat")) match { + case None => sys.error("Please download the dataset as explained in the assignment instructions") + case Some(resource) => Source.fromInputStream(resource).getLines().toList + } + } + + private[wikipedia] def parse(line: String): WikipediaArticle = { + val subs = "</title><text>" + val i = line.indexOf(subs) + val title = line.substring(14, i) + val text = line.substring(i + subs.length, line.length-16) + WikipediaArticle(title, text) + } +} diff --git a/src/main/scala/wikipedia/WikipediaRanking.scala b/src/main/scala/wikipedia/WikipediaRanking.scala new file mode 100644 index 0000000..1306b61 --- /dev/null +++ b/src/main/scala/wikipedia/WikipediaRanking.scala @@ -0,0 +1,96 @@ +package wikipedia + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ +import org.apache.log4j.{Logger, Level} + +import org.apache.spark.rdd.RDD + +case class WikipediaArticle(title: String, text: String) { + /** + * @return Whether the text of this article mentions `lang` or not + * @param lang Language to look for (e.g. 
"Scala") + */ + def mentionsLanguage(lang: String): Boolean = text.split(' ').contains(lang) +} + +object WikipediaRanking extends WikipediaRankingInterface { + // Reduce Spark logging verbosity + Logger.getLogger("org").setLevel(Level.ERROR) + + val langs = List( + "JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS", + "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy") + + val conf: SparkConf = ??? + val sc: SparkContext = ??? + // Hint: use a combination of `sc.parallelize`, `WikipediaData.lines` and `WikipediaData.parse` + val wikiRdd: RDD[WikipediaArticle] = ??? + + /** Returns the number of articles on which the language `lang` occurs. + * Hint1: consider using method `aggregate` on RDD[T]. + * Hint2: consider using method `mentionsLanguage` on `WikipediaArticle` + */ + def occurrencesOfLang(lang: String, rdd: RDD[WikipediaArticle]): Int = ??? + + /* (1) Use `occurrencesOfLang` to compute the ranking of the languages + * (`val langs`) by determining the number of Wikipedia articles that + * mention each language at least once. Don't forget to sort the + * languages by their occurrence, in decreasing order! + * + * Note: this operation is long-running. It can potentially run for + * several seconds. + */ + def rankLangs(langs: List[String], rdd: RDD[WikipediaArticle]): List[(String, Int)] = ??? + + /* Compute an inverted index of the set of articles, mapping each language + * to the Wikipedia pages in which it occurs. + */ + def makeIndex(langs: List[String], rdd: RDD[WikipediaArticle]): RDD[(String, Iterable[WikipediaArticle])] = ??? + + /* (2) Compute the language ranking again, but now using the inverted index. Can you notice + * a performance improvement? + * + * Note: this operation is long-running. It can potentially run for + * several seconds. + */ + def rankLangsUsingIndex(index: RDD[(String, Iterable[WikipediaArticle])]): List[(String, Int)] = ??? 
+ + /* (3) Use `reduceByKey` so that the computation of the index and the ranking are combined. + * Can you notice an improvement in performance compared to measuring *both* the computation of the index + * and the computation of the ranking? If so, can you think of a reason? + * + * Note: this operation is long-running. It can potentially run for + * several seconds. + */ + def rankLangsReduceByKey(langs: List[String], rdd: RDD[WikipediaArticle]): List[(String, Int)] = ??? + + def main(args: Array[String]): Unit = { + + /* Languages ranked according to (1) */ + val langsRanked: List[(String, Int)] = timed("Part 1: naive ranking", rankLangs(langs, wikiRdd)) + + /* An inverted index mapping languages to wikipedia pages on which they appear */ + def index: RDD[(String, Iterable[WikipediaArticle])] = makeIndex(langs, wikiRdd) + + /* Languages ranked according to (2), using the inverted index */ + val langsRanked2: List[(String, Int)] = timed("Part 2: ranking using inverted index", rankLangsUsingIndex(index)) + + /* Languages ranked according to (3) */ + val langsRanked3: List[(String, Int)] = timed("Part 3: ranking using reduceByKey", rankLangsReduceByKey(langs, wikiRdd)) + + /* Output the speed of each ranking */ + println(timing) + sc.stop() + } + + val timing = new StringBuffer + def timed[T](label: String, code: => T): T = { + val start = System.currentTimeMillis() + val result = code + val stop = System.currentTimeMillis() + timing.append(s"Processing $label took ${stop - start} ms.\n") + result + } +} diff --git a/src/main/scala/wikipedia/WikipediaRankingInterface.scala b/src/main/scala/wikipedia/WikipediaRankingInterface.scala new file mode 100644 index 0000000..eacef4f --- /dev/null +++ b/src/main/scala/wikipedia/WikipediaRankingInterface.scala @@ -0,0 +1,20 @@ +package wikipedia + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ +import org.apache.spark.rdd.RDD + +/** + * The interface used by the grading infrastructure. 
Do not change signatures
+ * or your submission will fail with a NoSuchMethodError.
+ */
+trait WikipediaRankingInterface {
+  def makeIndex(langs: List[String], rdd: RDD[WikipediaArticle]): RDD[(String, Iterable[WikipediaArticle])]
+  def occurrencesOfLang(lang: String, rdd: RDD[WikipediaArticle]): Int
+  def rankLangs(langs: List[String], rdd: RDD[WikipediaArticle]): List[(String, Int)]
+  def rankLangsReduceByKey(langs: List[String], rdd: RDD[WikipediaArticle]): List[(String, Int)]
+  def rankLangsUsingIndex(index: RDD[(String, Iterable[WikipediaArticle])]): List[(String, Int)]
+  def langs: List[String]
+  def sc: SparkContext
+  def wikiRdd: RDD[WikipediaArticle]
+}
diff --git a/src/test/scala/wikipedia/WikipediaSuite.scala b/src/test/scala/wikipedia/WikipediaSuite.scala
new file mode 100644
index 0000000..aff7d25
--- /dev/null
+++ b/src/test/scala/wikipedia/WikipediaSuite.scala
@@ -0,0 +1,148 @@
+package wikipedia
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
+import org.junit._
+
+class WikipediaSuite {
+  /**
+   * Forces initialization of the `WikipediaRanking` object and reports whether it succeeded.
+   * Any failure is printed and turned into `false` so each test can fail with a readable hint.
+   */
+  def initializeWikipediaRanking(): Boolean =
+    try {
+      WikipediaRanking
+      true
+    } catch {
+      // Deliberately Throwable: a failed object initializer can surface as a LinkageError
+      // (ExceptionInInitializerError / NoClassDefFoundError), which NonFatal does not match.
+      case ex: Throwable =>
+        println(ex.getMessage)
+        ex.printStackTrace()
+        false
+    }
+
+  import WikipediaRanking._
+
+  /**
+   * Creates a truncated string representation of a list, adding ", ...)" if there
+   * are too many elements to show
+   * @param l The list to preview
+   * @param n The number of elements to cut it at
+   * @return A preview of the list, containing at most n elements.
+   */
+  def previewList[A](l: List[A], n: Int = 10): String =
+    if (l.length <= n) l.toString
+    else l.take(n).toString.dropRight(1) + ", ...)"
+
+  /**
+   * Asserts that all the elements in a given list and an expected list are the same,
+   * regardless of order. For a prettier output, given and expected should be sorted
+   * with the same ordering.
+   * @param actual The actual list
+   * @param expected The expected list
+   * @tparam A Type of the list elements
+   */
+  def assertSameElements[A](actual: List[A], expected: List[A]): Unit = {
+    val givenSet = actual.toSet
+    val expectedSet = expected.toSet
+
+    val unexpected = givenSet -- expectedSet
+    val missing = expectedSet -- givenSet
+
+    val noUnexpectedElements = unexpected.isEmpty
+    val noMissingElements = missing.isEmpty
+
+    val noMatchString =
+      s"""
+         |Expected: ${previewList(expected)}
+         |Actual: ${previewList(actual)}""".stripMargin
+
+    assert(noUnexpectedElements,
+      s"""|$noMatchString
+          |The given collection contains some unexpected elements: ${previewList(unexpected.toList, 5)}""".stripMargin)
+
+    assert(noMissingElements,
+      s"""|$noMatchString
+          |The given collection is missing some expected elements: ${previewList(missing.toList, 5)}""".stripMargin)
+  }
+
+  // Conditions:
+  // (1) the language stats contain the same elements
+  // (2) they are ordered (and the order doesn't matter if there are several languages with the same count)
+  def assertEquivalentAndOrdered(actual: List[(String, Int)], expected: List[(String, Int)]): Unit = {
+    // (1)
+    assertSameElements(actual, expected)
+    // (2) pair every element with its successor; `drop(1)` (unlike `tail`) is safe on an empty list
+    assert(
+      !actual.zip(actual.drop(1)).exists({ case ((_, occ1), (_, occ2)) => occ1 < occ2 }),
+      "The given elements are not in descending order"
+    )
+  }
+
+  @Test def `'occurrencesOfLang' should work for (specific) RDD with one element`: Unit = {
+    assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?")
+    val rdd = sc.parallelize(Seq(WikipediaArticle("title", "Java Jakarta")))
+    val res = (occurrencesOfLang("Java", rdd) == 1)
+    assert(res, "occurrencesOfLang given (specific) RDD with one element should equal to 1")
+  }
+
+  @Test def `'rankLangs' should work for RDD with two elements`: Unit = {
+    assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?")
+    val langs = List("Scala", "Java")
+    val rdd = sc.parallelize(List(WikipediaArticle("1", "Scala is great"), WikipediaArticle("2", "Java is OK, but Scala is cooler")))
+    val ranked = rankLangs(langs, rdd)
+    val res = ranked.head._1 == "Scala"
+    assert(res, s"rankLangs should rank Scala first, but the ranking was: $ranked")
+  }
+
+  @Test def `'makeIndex' creates a simple index with two entries`: Unit = {
+    assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?")
+    val langs = List("Scala", "Java")
+    val articles = List(
+        WikipediaArticle("1","Groovy is pretty interesting, and so is Erlang"),
+        WikipediaArticle("2","Scala and Java run on the JVM"),
+        WikipediaArticle("3","Scala is not purely functional")
+      )
+    val rdd = sc.parallelize(articles)
+    val index = makeIndex(langs, rdd)
+    val res = index.count() == 2
+    assert(res, "makeIndex should produce exactly 2 entries (one for Scala, one for Java)")
+  }
+
+  @Test def `'rankLangsUsingIndex' should work for a simple RDD with three elements`: Unit = {
+    assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?")
+    val langs = List("Scala", "Java")
+    val articles = List(
+        WikipediaArticle("1","Groovy is pretty interesting, and so is Erlang"),
+        WikipediaArticle("2","Scala and Java run on the JVM"),
+        WikipediaArticle("3","Scala is not purely functional")
+      )
+    val rdd = sc.parallelize(articles)
+    val index = makeIndex(langs, rdd)
+    val ranked = rankLangsUsingIndex(index)
+    val res = (ranked.head._1 == "Scala")
+    assert(res, s"rankLangsUsingIndex should rank Scala first, but the ranking was: $ranked")
+  }
+
+  @Test def `'rankLangsReduceByKey' should work for a simple RDD with five elements`: Unit = {
+    assert(initializeWikipediaRanking(), " -- did you fill in all the values in WikipediaRanking (conf, sc, wikiRdd)?")
+    val langs = List("Scala", "Java", "Groovy", "Haskell", "Erlang")
+    val articles = List(
+        WikipediaArticle("1","Groovy is pretty interesting, and so is Erlang"),
+        WikipediaArticle("2","Scala and Java run on the JVM"),
+        WikipediaArticle("3","Scala is not purely functional"),
+        WikipediaArticle("4","The cool kids like Haskell more than Java"),
+        WikipediaArticle("5","Java is for enterprise developers")
+      )
+    val rdd = sc.parallelize(articles)
+    val ranked = rankLangsReduceByKey(langs, rdd)
+    val res = (ranked.head._1 == "Java")
+    assert(res, s"rankLangsReduceByKey should rank Java first, but the ranking was: $ranked")
+  }
+
+
+
+  @Rule def individualTestTimeout = new org.junit.rules.Timeout(100 * 1000)
+}