VizByWiki: Mining Data Visualizations from the Web
to Enrich News Articles
Allen Yilun Lin¹, Joshua Ford², Eytan Adar³, and Brent Hecht¹
¹Northwestern University; ²Apple Inc. (work done at the University of Minnesota); ³University of Michigan
allen.lin@eecs.northwestern.edu, ford0420@umn.edu, eadar@umich.edu, bhecht@northwestern.edu
ABSTRACT
O070%?'.( 0"'P07'-$.%'$%$#=.%027';"#.%D#CIC+%M0A.+%"'$#%I20A/.+%N02%
;/027.H%I2#07"6%#$2';/%7/#%;-$7#$7%-F%$#=.%027';"#.%0$3%2#.("7%'$%
=#""Q#.70N"'./#3% 'MA2-?#M#$7.% 7-% 2#03#2% ;-MA2#/#$.'-$C%
:-=#?#2+%#R'.7'$I%.6.7#M.%7/07%I# $#207#%$#=.% 3070%?'.(0"'P0Q
7'-$.%#'7/#2%2#S('2#%.(N.70$7'0"%M0$(0"%#FF-27%-2%02#%"'M'7#3%7-%
?#26% .A#;'F';% 76A#.%-F% 3070% ?'.(0"'P07'-$.+% 7/#2#N6%I2#0 7"6% 2#Q
.72';7'$I%7/ #%$(MN#2%-F%$#=.%027';"#.%7/07%;0$%N#%#$/0$;#3C%T-%
0332#..%7/'.%'..(#+%=#%3#F'$#%0%$#=%A2-N"#MU%I'?#$%0%$#=.%027'Q
;"#+% 2#72' # ?#% 2#"#?0$7% ?'.(0"'P07'-$.% 7/07% already' exist' on' the'
webC%V#%./-=%7/07%7/'.%A2-N"# M %' .%720;70N"#%7/2-(I/%0%$#=%.6.Q
7#M+%VizByWiki+%7/07%M'$#.%;-$7#R7(0""6%2#"#?0 $7%3070%?'.(0"'P0Q
7'-$.%F2-M%V'E'M#3'0% W-MM -$.+%7/#%;#$720"%F'"# %2#A-.'7-26%F-2%
V'E'A#3'0C%>.'$I%0%$-?#"%I2-($3% 72(7/% 3070.#7+% =#%./-=%7/07%
X'P96V'E'%;0$%.(;;#..F(""6%0(IM#$7%0.%M0$6%0.%LYZ%-F%A-A(Q
"02% -$" '$#% $#=.% 027';"#.% ='7/% $#=.% ?'.(0"'P07'-$.C% V#% 0".-%
3#M-$.7207#%7/07%X'P96V'E'%;0$%0(7-M07';0""6%20$E%?'.(0"'P0Q
7'-$.%0;;-23'$I% 7-% 7/#'2% (.#F("$#..% ='7/%2#0.-$0N"#% 0;;(20;6%
D$OW[J\%-F%KCY4HC T-%F0;'"'707#%F(27/#2%03?0$;#.%-$%]$#=.%?'.Q
(0"'P07'-$%2#72'#?0"%A2-N"#M^+%=#%2#"#0.#%-(2%I2-($3%72(7/%30Q
70.#7%0$3%M0 E#%-(2%.6.7#M%.-(2;#%;-3#%A(N"';"6%0?0'"0N"#C%
KEYWORDS: news articles; Wikimedia Commons; user-
generated content; data visualizations; peer production; Wikipedia
1 INTRODUCTION
Data visualizations have become an increasingly prominent
part of the news landscape [2, 10, 26]. Indeed, The New York
Times, The Washington Post, The Wall Street Journal and The
Guardian all now operate entire teams that design and publish
data visualizations [33]. Recent research suggests that this
trend is quite beneficial for the reader: empirical evidence has
shown that data visualizations can make complex relationships
in news articles easier to comprehend and can provide critical
context for news narratives [33]. More generally, coupling text
with data visualizations has been shown to improve understanding
and recall over either text or visualizations alone
[5, 8, 15] (in accordance with the well-established dual-coding
theory from cognitive science [31]).
While news data visualizations are highly beneficial and in
great demand, creating them requires time, money and expertise.
This means that local news organizations usually cannot
afford to make data visualizations, and even large national
news outlets can only create them for a tiny fraction of articles.
Researchers from information visualization and data-driven
journalism [2] have developed many tools to help automate
this process. However, as we discuss below, these systems either
still require substantial human intervention (e.g., [23, 30,
31, 41]) or only work with very specific topical domains (e.g.,
financial reports [11]) or visualization types (e.g., maps [8]).
In this paper, we propose an alternative and tractable approach
to automatically add contextually-relevant data visualizations
to news articles with no human intervention. Moreover,
unlike existing techniques, our approach has very few constraints
with respect to topical domain or visualization type.
Our approach is based on a novel insight: there is often no
need to create visualizations from the ground up because large
numbers of data visualizations already exist in Wikimedia Commons.
Wikimedia Commons, or “the Commons” [40], is the media
repository used by Wikipedia editors and has over 41 million
images, a non-trivial percentage of which are data visualizations
created to support Wikipedia articles. Moreover, the
visualizations in the Commons have redistribution-friendly licenses,
making the Commons a particularly appealing resource
for news publishers. To the best of our knowledge, this paper is
the first to recognize and leverage the rich corpus of data visualizations
that exists in the Commons. As we discuss below, we
hope our research can support further inquiry into the value of
this important repository, both within the news visualization
context and beyond it.
In more formal and general terms, this paper defines a new
problem that we call the news data visualization retrieval problem.
Given an arbitrary news article, the goal of the news data
visualization retrieval problem is to automatically retrieve relevant,
pre-existing data visualizations to support the article
from a given repository (in our case, the Commons).
This paper is published under the Creative Commons Attribution 4.0
International (CC-BY 4.0) license. Authors reserve their rights to disseminate the
work on their personal and corporate Web sites with the appropriate
attribution.
WWW 2018, April 23-27, 2018, Lyon, France.
© 2018 IW3C2 (International World Wide Web Conference Committee),
published under Creative Commons CC BY 4.0 License.
ACM ISBN 978-1-4503-5639-8/18/04.
DOI: https://doi.org/10.1145/3178876.3186135
We demonstrate that the news data visualization retrieval
problem is tractable through a new, end-to-end system called
VizByWiki¹. As an example, consider Figure 1, which shows an
article about a drop in oil prices that includes a line chart created
by journalists (Figure 1a). For this article, VizByWiki
mined the Commons and retrieved a line chart and a map and
ranked them according to their usefulness to this article (Figure 1b).
Compared to the data visualizations in the original article,
the line chart presents the same variables (Brent Crude oil
price over time, although for a longer time range). Additionally,
the thematic map was rated as useful to the article by study
participants, but was not included by the news publisher.
To formally evaluate VizByWiki and to learn the factors that
predict useful data visualizations, we designed a crowdsourcing
task to collect human ratings on the usefulness of the visualizations
retrieved by VizByWiki (which resulted in the usefulness
score for the map in Figure 1). We demonstrate that VizByWiki
can enrich around 50% of popular online news articles
with at least one somewhat useful data visualization. Moreover,
using this ground truth dataset, we gained an initial understanding
of the factors that determine useful data visualizations
for news articles and trained VizByWiki to rank visualizations
according to this understanding. As we show below, VizByWiki’s
ranking accuracy is good, with an nDCG@5 of 0.82.
Overall, this paper both contributes a new problem (the
news data visualization retrieval problem) and demonstrates
¹ Link to the system demo and source code repository:
http://www.psagroup.org/projects/vizbywiki
that the problem is tractable with an end-to-end system contribution
(VizByWiki). We also make two other contributions
whose impact may generalize beyond the news visualization
context. First, this paper helps to demonstrate the tremendous
potential of Wikimedia Commons, a resource that – unlike its
sister project Wikipedia – remains largely untapped by the
computing research community. Second, a key component of
VizByWiki involves automatically distinguishing visualizations
from non-visualizations, a new challenge that could be
relevant to other domains (e.g., image classification). In this paper,
we show that this challenge can be addressed with an F1-score
of 0.91 by simply using a pre-trained Convolutional Neural
Network (CNN).
The structure of this paper follows best practices in the systems
research genre (e.g., [8, 11, 14, 32]) by first providing an
overview of our system’s components and then outlining the
novel challenges faced in each component and describing how
the challenges were addressed. Prior to this, however, we begin
below by highlighting related work.
2 RELATED WORK
In this section, we review the three research areas that most
directly motivated this work: 1) automated text illustration, 2)
other systems that generate news data visualizations, and 3)
methods for data visualization classification.
Figure 1: (a) A BBC article about the oil price drop after producers failed to agree on an output freeze and (b) the data visual-
izations retrieved by VizByWiki. In (b), the retrieved data visualizations are ranked in descending order according to readers’
perceived usefulness.
2.1 Automated Text Illustration
One area of the literature that provided key motivation for
this work is text illustration. Text illustration is a constrained
image retrieval problem that focuses on retrieving illustrative
images for long texts. Some of this work has focused specifically
on illustrating news articles with multimedia files. For instance,
Li and Hai [17] illustrate news articles with images from Flickr.
NewsMap [19] focuses on fusing image search results from multiple
short queries that are generated from news articles. Similarly,
Delgado and Joao’s system [5] constructed a visual story of
news articles by finding a sequence of images. BreakingNews
[28] leveraged a CNN to learn to match the original images and
the text of the news article.
Some text illustration systems also use images on Wikipedia,
but focus on a small-scale, self-curated sample of these images
(e.g., [37]). The only work that considers all Wikipedia images
is by Agrawal et al. [1], who designed a book illustration
system to enrich textbooks for developing countries. Their
method essentially ranks Wikipedia images according to scores
computed from the token overlap between the keywords of the
text and the descriptions of the image. Due to its similarity to
our problem (using Wikipedia images for long text), we implemented
their algorithm and used it as a baseline for comparison
in the evaluation section, showing that we outperform this
approach by a solid margin.
:-=#?#2+% 7/#% news' data' visualization' retrieval' problem% '.%
0".-%F($30M#$70""6%3'FF#2#$7% F2-M%7#R7%' ""(.7207'-$%A2-N"#MC%1-2%
#R0MA"#+%;-$.'3#2%0%$#=.%027';"#%0N-(7%0'2%A-""(7'-$%'$%9#'c'$IC%
B$%7/#%7#R7%'""(.7207'-$%A2-N"#M+%7/#%'3#0"%.-"(7'-$%=-("3%N#%0%
A/-7-%3#A';7'$I%.M-I%'$%9#'c'$IC% :-=#?#2+%'$%7/#%$#=.%3070%?'.Q
(0"'P07'-$%.;#$02'-%0$%0AA2-A2'07#%'M0I#%=-("3%N#%0%N02%;/027%
./-='$I%7/#%$(MN#2%-F%306.%'$%#0;/%M-$7/%7/07%/0?#%0%iG4C\%
2#03'$I%I2#07#2%7/0$%7/#%.0F#%"#?#"C%G-2#%I#$#20""6+%7/#%$#=.%
?'.(0"'P07'-$%2#72'#?0"%A2-N"#M%'.%'$7#2#.7#3%'$%033'$I%$#=%'$Q
F-2M07'-$%7-%;-$7#R7(0"'P#%0%.7-26+%$-7%F'$3'$I%0%A/-7-%7/07%='""%
3#A';7%#R0;7"6%=/07%'.%=2'77#$%'$%7/#%7#R7C%g'M'"02"6+%?'.(0"%F#0Q
7(2#.%A"06%0%M(;/%3'FF#2#$7%2-"#%'$%7/#%3070%?'.(0"'P07'-$%A2-NQ
"#M@%3070%?'.(0"'P07'-$.%-F%7/#%.0M#%76A#%;-("3%./02#%?#26%.'MQ
'"02%?'.(0"%F#07(2#.% N(7%;-?#2%.(N.70$7'0""6%3'FF#2#$7%7-A';.%0$3%
3070C%d(2%M#7/-3-"-I';0"%;/-';#.%2#F"#;7%-(2%;-$.'3#207'-$.%-F%
7/#.#%'MA-270$7%3'FF#2#$;#.+%0$3%'.%-$#%2#0.-$%-(2%0AA2-0;/%
-(7A#2F-2M.%7/#%N0.#"'$#%M#7/-3%F-2%7#R7%'""(.7207'-$C%
2.2 Generating Data Visualizations for News
Several research projects in the information visualization
and data-driven journalism domains have sought to automate
the ground-up generation of news data visualizations. However,
these systems either (1) still require a substantial degree
of manual effort or (2) focus on a narrow domain within news
articles. An example of the former case is the MuckRaker system
[23], which provides a user interface to help find and visualize
structured data from databases that is relevant to a given
news article. Research systems that automate the creation of
specific categories of news visualizations include Contextifier
[11], which produces annotated stock visualizations for financial
news articles. However, the system is limited to financial
news and only generates line charts. NewsViews [8] is built to
automatically generate geovisualizations for news articles
through a pipeline that involves identifying toponyms and topics,
finding relevant tabular datasets and creating a thematic
map. Like Contextifier, NewsViews focuses only on a specific
type of data visualization (thematic maps) and a specific type
of article (those with strong geographic elements). NewsViews
is further limited by the small, manually-curated structured datasets
from which the system generates maps. In comparison,
VizByWiki does not require human intervention and operates
without the limitations on the types of data visualizations, the
types of news articles, and the diversities of curated datasets.
2.3 Classifying Data Visualizations
As described below, a necessary step in VizByWiki is differentiating
between data visualizations (e.g., bar charts and pie
charts) and other images (e.g., photos, engineering diagrams).
This problem is related to the problem of identifying different
visualization mark types, e.g., separating bar charts from pie
charts. This is a task addressed in several papers that try to recover
the data behind statistical charts. For instance, ReVision
[32] classifies visualization types using a combination of textual
features from OCR and low-level visual features which capture
prominent patterns in the images. More recently, deep
learning has been used in this problem space. For example,
ChartSense [14] includes a mark type classifier that was trained
from scratch using the GoogLeNet architecture. Similarly, Heer
et al. [27] built a visualization type classifier by fine-tuning a
pre-trained CNN and, using the same fine-tuning technique,
FigureSeer [35] successfully trained a classifier to distinguish
different types of results figures in research papers.
While VizByWiki faced a different problem than the research
described above (i.e., determining whether an image is a
data visualization vs. distinguishing between types of visualizations),
previous works’ success using pre-trained CNNs provided
key methodological guidance for our approach.
3 SYSTEM OVERVIEW
In this section, we first provide an overview of VizByWiki’s
high-level user experience. We then describe VizByWiki’s
behind-the-scenes system architecture. Finally, we discuss the
different types of datasets that are used to build VizByWiki.
3.1 User Experience
The primary audience for VizByWiki is the millions of people
who read news online. For this audience, we have built a
working browser plug-in (demonstrated as a web application
in the URL on page 2) that implements all the techniques below.
The plug-in processes unstructured text from a news article
and presents users with data visualizations that are ranked by
their usefulness alongside the news article (as shown in Figure 1b).
A potential secondary audience for VizByWiki is those in
the data-driven journalism community [26]. For this audience,
VizByWiki could be used as an exploratory tool to inform the
design of customized data visualizations (as well as perhaps in
the journalistic discovery process).
3.2 System Architecture
In this section, we provide a high-level overview of the system
architecture of VizByWiki. As shown in Figure 2, VizByWiki
consists of a three-stage pipeline: 1) topic filtering, 2) data visualization
identification and 3) data visualization ranking. In
Stage 1, VizByWiki uses Wikipedia articles that are relevant to
the news article and that contain Wikimedia Commons images
as a proxy to filter out topically irrelevant images. To find relevant
Wikipedia articles, VizByWiki first applies entity linking
techniques to identify Wikipedia concepts/articles that are
mentioned in the news article text. The resulting Wikipedia article
set is then expanded by finding the most semantically related
Wikipedia articles using semantic relatedness (SR)
measures. All images in the expanded set of Wikipedia articles
are extracted and delivered to Stage 2.
In Stage 2, the extracted images are filtered to isolate data
visualizations from non-data visualizations. VizByWiki employs
a two-step filtering approach that consists of both
straightforward heuristics and CNN-based transfer learning
techniques. The output of Stage 2 – a set of topically relevant
data visualizations – is then processed by the data visualization
ranker in Stage 3. The ranker was trained using a new ground
truth dataset that we collected. The dataset consists of human
judgments describing which data visualizations are useful to
which news articles.
3.3 Dataset
3.3.1 Wikimedia Commons and English Wikipedia
VizByWiki augments news articles with data visualizations
from Wikimedia Commons. Wikimedia Commons is the central
media files repository for all the Wikipedia language editions
and is the largest repository of freely licensed educational content
in the world [40]. We chose Wikimedia Commons over
other popular media repositories (e.g., Flickr) because it contains
a large number of data visualizations due to its encyclopedic
focus. Using a highly accurate data visualization classifier
developed as part of VizByWiki (see below), we estimated that
the Commons contains roughly three million data visualizations.
We were also attracted to the Commons’ licensing regime,
as this mitigates any real-world legal barriers to any commercial
use of VizByWiki. As discussed below, the Commons
proved to be a powerful repository of data visualizations that
we believe can be leveraged in contexts beyond this project.
An obstacle to directly leveraging Wikimedia Commons is
its poor metadata. The Wikimedia Commons community itself
has stated that images are “described only by casual notation,
making it difficult to fully explore and use this remarkable resource”
and even this “casual notation” does not exist for all images
[40]. Moreover, while the Commons began a long-term
project to standardize its metadata in 2017, this process is far
from finished [40].
To address this obstacle, we used the English Wikipedia to
augment the knowledge that we have about Commons images.
The Commons is the dominant image repository for most Wikimedia
projects, including Wikipedia [42]. As such, English Wikipedia
articles and the text around Commons images that appear
in these articles can provide natural semantic embeddings
for Commons images. We note that this may be a useful approach
to seed the metadata standardization process in the full
Commons as well. However, a side effect of this approach is that
it does limit our pool of potential data visualizations to those
that appear in at least one English Wikipedia article.
To process the full English Wikipedia, we leveraged
WikiBrain [34], a software framework that processes Wikipedia
XML dumps and provides access to a range of Wikipedia-based
algorithms (including the semantic relatedness algorithms
we used). For our studies, we used the June 23, 2017
English Wikipedia dump.
3.3.2 News Articles Datasets
To perform realistic experiments on VizByWiki, we sampled
two datasets of popular online news articles from major news
outlets. The first news article dataset (which we call the ad hoc
dataset) was originally collected during April 2016 (for an earlier
project) and consists of 40 popular online news articles
sampled arbitrarily from the home pages of various large news
Figure 2. Overview of VizByWiki’s architecture
outlets including CNN, Fox News, BBC, and The New York Times.
Some of these news articles are accompanied by data visualizations
designed by experts. We used this dataset primarily for
early feasibility testing.
Our core dataset (called uniform) was collected during July
2017 and consists of 60 articles sampled from the different topical
categories provided by popular news outlets. These articles
were sampled through the RSS feeds of Fox News and CNN, both
of which are organized into topics including “World”, “U.S.”,
“Business”, “Politics”, “Technology”, “Health”, “Entertainment”,
and “Travel”. We randomly sampled the same number of articles
from each category. Different from the previous dataset,
these popular online articles were selected without consideration
of whether they already contained visualizations. The purpose
of this dataset is to evaluate the general utility of the system
in an ecologically valid fashion, i.e., how well it could retrieve
data visualizations for arbitrary news content.
In some evaluations, we also combined these two datasets
into a single 100-article dataset. We call this dataset combined.
4 STAGE 1: TOPIC FILTERING
The first stage in the VizByWiki pipeline is identifying topically
relevant Wikimedia Commons images for an input news
article. Here, we used Wikipedia articles that contain the Wikimedia
Commons images as proxies for the topic of the images.
VizByWiki deploys a two-step process that leverages both wikification
[4, 24] and query expansion techniques to identify the
appropriate Wikipedia articles.
4.1 Wikification
Wikification involves disambiguating named entities in unstructured
text to Wikipedia articles [24]. For example, the system
can recognize that “travel ban” in a recent news article
refers to “Executive Order 13769” (and link it to the associated
Wikipedia page) and not two separate words (“travel” and
“ban”). We use a technique developed by Noraset et al. [25],
which uses hybrid rule-based named entity recognition to discover
terms and a learned model to disambiguate their corresponding
Wikipedia entities.
The output of the wikification process contains many entities
that are trivially related to the main topic of the article (e.g.,
countries that were mentioned in the news article but not included
in the list of banned countries). To filter out these entities,
we computed the semantic relatedness (SR) between these
“wikified” entities and the news article content and only kept
entities that are highly related to the article content. Specifically,
we leveraged WikiBrain’s implementation of the Explicit
Semantic Analysis semantic relatedness algorithm (ESA) [7],
which maps words into Wikipedia-concept-based embeddings
and computes semantic relatedness as the cosine similarity of
the embeddings. We only kept wikified entities that had
an ESA SR greater than 0.8 (out of 1) with the news content. In
WikiBrain, an SR score of 0.8 effectively means that this score
is at the 80th percentile of all SR scores [39].
4.2 Query Expansion
The entities that are output from the SR filter are used as
“seed queries” for retrieving Wikimedia Commons images.
However, query expansion is also necessary due to Wikipedia’s
organization and its relationship to the articles on which data
visualizations about specific topics appear. For instance, content
about a concept is often contained not only in the “main
article” about the concept (e.g., the “United States” article) but
also in “sub-articles” (e.g., the articles “History of the United
States”, “Geography of the United States”, “American Literature”
and so on) [21]. This issue has important implications for
our problem. Consider again a news article that covers the
Trump administration’s changes to U.S. travel policy. The term
“immigration policy” in the article might be correctly disambiguated
to the Wikipedia article “Immigration Policy”. However,
this linkage will miss the Wikipedia article “Immigration Policy
of Donald Trump”, which is a sub-article of “Immigration Policy”
and contains useful statistical charts related to foreign-born
workers in the US labor force.
To address this issue, VizByWiki expands the queries to include
articles that are highly semantically related to wikified
entities using ESA. Similar to the criteria above, to ensure that
these expanded entities are highly relevant, we only include
them if they have an SR ≥ 0.8 with the news article.
Finally, the output of Stage 1 consists of Wikimedia Commons
images that are extracted from the Wikipedia articles
identified from the above two steps.
4.3 Evaluation
We validated the feasibility of our topic filtering approach
using the combined dataset. Table 1 reports that on average,
our approach could “wikify” 6.3 Wikipedia entities for each
news article (after the SR filter). Table 1 also shows that query
expansion with semantic relatedness successfully increased
this number to 10.3. From these Wikipedia articles, Stage 1 extracted
an average of 69.6 unique image candidates for each
news article (6473 unique images in total for the combined dataset).
The results in Table 1 demonstrate that our topic filtering
approach is a viable approach to retrieve many image candidates
for the later stages of the system (although it does not
provide a guarantee that this is the best such approach, a topic
to which we return in Discussion).
5 STAGE 2: IDENTIFICATION
The goal of Stage 2 is to identify the visualization “needles” out
of the large “haystack” of diverse images from Stage 1.
Table 1. Performance of Stage 1: Topic Filtering

                        | Avg. # of Wiki    | Avg. # of images
                        | articles per news | extracted per news article
Wikification            | 6.3               | 56.0
After query expansion   | 10.3              | 69.6
5.1 Problem Definition
Prior literature [11] has identified that common types of
news data visualizations include maps, line graphs, bar graphs,
bubble charts, scatterplots, tree maps and pie charts. Less common
forms such as area graphs and Venn diagrams are also
broadly viewed as data visualizations [27, 32]. In VizByWiki,
we assume that the definition of data visualizations subsumes
all of the above types.
As mentioned above, information visualization researchers
have examined the problem of distinguishing different types of
data visualizations from one another. However, the problem
we face here – separating data visualizations from non-data visualizations
– presents two major challenges that make it distinct
from prior work. First and foremost, unlike our problem,
research on visualization type classification usually starts with
a corpus containing only data visualizations [14, 27, 32, 35]. By
contrast, Wikimedia Commons contains much more diverse
types of images, including many non-data visualization images
that share visual similarities with data visualizations (e.g., engineering
diagrams, photos of paper maps, logos). Second, due
to the crowdsourced nature of Wikimedia Commons, even visualizations
of the same type can be visually heterogeneous,
which increases the difficulty of classification. Datasets used in
the type classification problem do not contain this degree of
heterogeneity; they are often manually curated by researchers
[32], designed in the same fashion from the same tool [27], or
carefully generated to specific standards by professionals [35].
To address both of these challenges, VizByWiki contains a
two-step data visualization identifier that leverages both textual
and visual features. We explain each step in detail below.
5.1 Keyword Filtering
In the first step of our data visualization identification process,
a naive keyword filter eliminates obvious non-data visualizations
using rule-based heuristics. The filter uses text
metadata from both Wikipedia and Wikimedia Commons.
Metadata considered includes image captions from Wikipedia,
file descriptions and category tags from Wikimedia Commons,
and machine-generated EXIF metadata. To develop our heuristics,
one researcher went through a series of example images
and identified keywords that indicate obvious non-visualizations.
These included “photograph,” “picture,” “image,” “featured,”
“photo,” “portrait,” “road sign,” and “coats of arms.” We
also excluded images that contain camera EXIF information
(e.g., a camera make/model), which clearly indicates an image
from a digital camera rather than a data visualization. On the
combined dataset, the keyword filter identified 5718 images as
obvious non-data visualizations out of the 6473 images from
Stage 1’s output. This left us with 755 candidate images that
were potentially data visualizations.
5.2 Image Classifier
In the second step of our data visualization identification
process, we trained an image classifier using visual features.
While keyword filtering effectively screens out many obvious
non-data visualizations, the output of the keyword filter still
contains many images that are not data visualizations. This is
primarily the result of three issues: (1) sparseness: many images
on the Commons have limited metadata, (2) errors: the
metadata that is available can be inaccurate, and (3) coverage:
it is intractable to develop a complete set of filter keywords.
As such, to increase precision, we used a pre-trained convolutional
neural network (CNN) to design an image classifier
that differentiates between data visualizations and other images.
CNNs have been shown to be effective in the data visualization
classification tasks described in related work (e.g., [14,
27, 35]), but they also require enormous numbers of ground
truth images for training. One way to address this is to leverage
the power of transfer learning with a pre-trained CNN, and we
adopted this approach in VizByWiki. Specifically, we used a
pre-trained CNN as a feature extractor: we leveraged the output
from the second-to-last CNN layer as a vector representation
of each image, and fed these representations into a traditional
classifier. This approach has been successfully employed
on a wide range of image recognition tasks [29].
In our implementation, we used the InceptionV3 CNN pre-trained
on ImageNet [36] and a support vector machine (SVM)
classifier. Because the features from the second-to-last layer of
the pre-trained CNN are sparse and high-dimensional (1024 dimensions),
feature engineering was necessary before training
on our relatively small ground truth dataset (detailed below).
We applied Principal Component Analysis and used the top-20
principal components as features (which account for about
51% of the variation). These 20 features were normalized to aid in
the training process.
To obtain a ground truth dataset that accurately represents
the underlying data distribution of this problem, we labeled all
755 images that were output from our keyword filtering step.
Since manually differentiating data visualizations from other
images is a relatively unambiguous task (under the concrete
definition of news data visualizations enumerated at the beginning
of this section), one researcher manually sorted our images
into a data visualization class (455 images) and a non-data
visualization class (300 images). This dataset was then split
into a development set (50%), which was used to train and tune
the hyperparameters of the SVM, and an evaluation set (50%),
which was used to evaluate classifier performance. We used
grid search to tune a variety of hyperparameters.
The best performing SVM classifier was found to employ a
radial basis function (RBF) kernel with $\gamma = 0.01$ and $C = 100$.
Table 2 shows the results of this classifier on our test dataset.
We were able to achieve an average F1 score of 0.91 and
Table 2. Performance of image classifier

Class       | Precision | Recall | F1-score
non-dataviz | 0.89      | 0.88   | 0.89
dataviz     | 0.93      | 0.91   | 0.91
avg         | 0.91      | 0.91   | 0.91
roughly equally high F1 scores for both the non-data visualization
class (0.89) and the data visualization class (0.91). These
are important results for two reasons. First, they represent a
more-than-adequate overall accuracy for our VizByWiki prototype,
allowing us to continue to the Stage 3 ranking task. Second,
they indicate that the visual features learned for detecting
objects in natural scene images (one of the main ImageNet
tasks) are useful for identifying data visualizations (which are
mostly computer-generated with very different visual characteristics),
a finding that deserves more exploration.
6 STAGE 3: RANKING
In Stage 1, VizByWiki extracts images relevant to an input
news article from Wikimedia Commons and in Stage 2, VizByWiki
filters out images that are not data visualizations. The
goal of Stage 3 – the final stage – is to rank the data visualizations
output from Stage 2 according to their usefulness to the
reader (as in Figure 1b). In this section, we first discuss how we
formulated the Stage 3 problem into a “learning to rank” problem.
We then document how we collected a novel visualization
usefulness ground truth dataset through crowdsourcing.
Lastly, we use this dataset to conduct two important evaluations:
one for the overall system’s general feasibility and the
other specifically for the ranker’s performance.
6.1 Problem Formulation
We formulated Stage 3 as a learning to rank problem similar
to the one that is typical for search engines: given a news article,
our goal was to rank a set of data visualizations by their
usefulness to the news article. Due to the novelty of this problem,
we had to construct our own ground truth dataset. To
build this dataset, we used the data visualizations output from
our work in Stage 2 and paired them with their corresponding
news articles. We manually corrected all classification mistakes
in order to ensure a data-visualization-only dataset, allowing
us to focus purely on the ranking task. This dataset consists of
572 ⟨news article, candidate data visualization⟩ pairs (note that
one data visualization can be paired with multiple articles).
1-2% #0;/% A0'2+% =#% I#$#207#3% N-7/% 7#R7(0"% 0 $3% ?'.(0"% F#0Q
7(2#.C%gA#;'F';0""6+%7/# %7#R7(0"%F#07(2#.%02#%0.%F-""-=.U%
%
content-caption: The semantic relatedness score (computed by Explicit
Semantic Analysis) between the news article content and the visualization
caption written by the editors of the Wikipedia article in which the image
is used (the article identified in Stage 1). Note that even though
metadata for images is sparse and can be inaccurate in the Commons, almost
all images have a caption when they are included in Wikipedia.
content-WPtitle: The semantic relatedness score between the news article
content and the title of the Wikipedia article that contains the candidate
image.
title-caption: The semantic relatedness score between the news article
title and the visualization caption.
The visual features are much simpler:

CNNembed: These are the same features that are used to train the image
classifier in Stage 2. They are the top 20 principal components from the
1024-dimension features extracted from a pre-trained CNN.
%
For our ranking algorithm, we used the popular RankSVM [13], which employs
a pairwise method that is trained to minimize the number of inversions. We
implemented the RankSVM as a linear kernel SVM using the Python package
scikit-learn.
6.2 Collecting Ground Truth Ratings
For each ⟨news article, candidate data visualization⟩ pair, we used
crowdsourcing to collect ground truth usefulness ratings. This dataset
allowed us to assess the feasibility of the news data visualization
retrieval problem (Section 6.3) and to learn to rank the data
visualizations for each news article (Section 6.4).
6.2.1 Task UI. The crowdsourcing platform we used was Amazon Mechanical
Turk (MTurk). Figure 3 shows the UI of the task (re-scaled for clarity).
Upon accepting our task, a crowdworker (“Turker”) was shown a task
tutorial and was provided with an example. The Turker was then randomly
assigned to one of the 100 news articles in the combined corpus.
After reading the article, the Turker was presented with all the candidate
data visualizations that are extracted by VizByWiki for the article
(output of Stage 2) and was asked to rate each visualization on a scale of
0–3 according to how useful the data visualization is. Each data
visualization was accompanied by its original Wikipedia article caption
and could be clicked to zoom in if the Turker wanted to examine its
details.
Usefulness was assessed on a four-point scale: 0 = “not useful”;
1 = “somewhat useful”; 2 = “useful”; 3 = “very useful”. A useful
visualization was defined as one that “helped explain or provide context”
to the article. We considered evaluating the retrieved visualizations on
various lower-level characteristics
Figure 3. MTurk task UI for usefulness ratings collection. News article
attenuated due to length.
from the visualization domain, e.g., expressiveness [22] and
interestingness [8] – rather than usefulness. However, we determined that
usefulness, as defined above, would more directly capture overall user
experience at this stage of the exploration of the news data visualization
problem.
6.2.2 Improving the quality of crowdsourced data. As is well-known in the
human computation domain, crowdsourced data is subject to quality issues
such as spam, errors, and biases [12]. For instance, unless precautions
are taken, some crowdworkers will fill in random answers without reading
the questions in order to make as much money as possible in a short period
of time. As such, we implemented the following strategies in our task to
improve crowdsourcing quality:
%
1) Following Chang et al. [3], we ensured that in the final dataset, no
   one worker did more than 5% of all the tasks. This simple technique
   effectively eliminates large-scale spam.
2) To minimize the effect of personal biases and unintentional errors, we
   rely on redundancy [12]. For each pair of ⟨news article, candidate data
   visualization⟩, we collected 4 ratings and used the median rating as
   the final rating.
3) We added verification questions as suggested by Kittur et al. [16].
   After workers read the news article and before they started rating the
   data visualizations, workers were required to answer a multiple-choice
   question about an obvious fact in the article.
%
In total, for 572 pairs of ⟨news article, candidate data visualization⟩,
we collected 2288 ratings (4×572). To facilitate further advancement on
the news data visualization retrieval problem, we are releasing our ground
truth data (see system URL above).
6.3 Evaluation 1: General Feasibility of the News
Data Visualization Retrieval Problem
With the ground truth data, an important question to address even before
training the ranker regards the feasibility of the broader news data
visualization retrieval problem. In other words, can Wikimedia Commons
provide useful visualizations for a non-trivial number of popular news
articles?
To answer this question, we used two metrics: 1) for a given news article,
how many “good” data visualizations could be retrieved by VizByWiki and
2) how many news articles could be augmented by at least one “good” data
visualization. We used two definitions for “good” data visualizations:
1) data visualizations with a median Turker rating greater than or equal
to 1 (i.e., somewhat useful), and 2) data visualizations with a median
Turker rating greater than or equal to 2 (i.e., useful).
Table 3 reports the feasibility evaluation of VizByWiki. Most importantly,
for our core news dataset uniform, VizByWiki could retrieve at least one
somewhat useful data visualization for 48.3% of articles and could
retrieve at least one useful data visualization for 21.7% of articles.
Recall that our uniform dataset contains popular online news articles that
are randomly and uniformly sampled from diverse topics. Hence, our results
demonstrate that our approach to news visualization retrieval and the use
of the VizByWiki system specifically could result in anywhere from
one-fifth to one-half of popular online news articles being enhanced by at
least one data visualization. We note that we observed roughly the same
results for the ad hoc dataset as well.
To understand the articles for which VizByWiki did not retrieve at least
one somewhat useful data visualization, one researcher carefully read over
20 such articles. We found that 1) most of these articles did not
explicitly reference any structured data in the text and 2) some of these
articles covered very recent breaking news in which data might need to be
gathered or updated quickly (e.g., an earthquake, election). We return to
both of these points below.
Looking at the number of data visualizations that were retrieved for each
article, VizByWiki was able to retrieve an average of 4.6 somewhat useful
visualizations and 3.5 useful visualizations for our core dataset uniform
(including all the zeros for articles for which no visualization could be
retrieved). A similar trend can be observed for the ad hoc dataset.
However, the actual number of good data visualizations varies
significantly across articles. As such, it is reasonable to conclude that
although our approach is likely feasible for large-scale visualization
enhancement, the performance of our approach is better for some articles
than for others. These results also suggest that for articles for which
VizByWiki can produce useful visualizations, ranking is important as there
is often a non-trivial number of visualizations per article. We address
this ranking problem in the next sub-section.
6.4 Evaluation 2: Performance of the Ranker
For our ranking experiment, we used the combined dataset. We used 50% of
the dataset as a development set (training and hyperparameter tuning), and
held out 50% for evaluation.
We assessed our data visualization ranker using the conventional search
engine evaluation approach involving the nDCG@k metric. To put our
ranker’s performance into context, we additionally implemented a ranking
algorithm from Agrawal et al. [1], which involved ranking Wikipedia images
for their relevance to a particular section from a textbook. This method
essentially relies on token overlap between the keywords of the text and
the descriptions of the image. The Agrawal et al. approach provides a
useful baseline to help understand our ranker’s performance.
Table 4 shows the performance of our ranker trained on different feature
sets and compared to the baseline method by Agrawal et al. We computed
nDCG@k (where k = 3, 5 and 7) for
Table 3. Feasibility evaluation of VizByWiki.
Metric
Uniform
Ad hoc
% of articles with ≥ 1 somewhat useful dataviz
48.3%
52.5%
% of articles with ≥ 1 useful dataviz
21.7%
27.5%
avg. # somewhat useful dataviz per article
4.6(5.3)
4.2(3.9)
avg. # useful dataviz per article
3.5(4.0)
2.7(1.7)
Note: Standard deviations shown in parentheses where relevant.
each news article and the mean and standard deviation of nDCG@k is what is
shown in Table 4. The results in row 2 show that our ranker trained on
textual features alone outperforms the baseline from prior work by a solid
margin. For further context, we additionally see that this version of the
ranker’s performance (e.g., nDCG@5 = 0.82) is comparable to past research
in the web search domain that also involved defining new problems (e.g.,
[6, 38]). The results in row 3 are also quite informative. They show that
adding visual features decreases ranking quality. Our hypothesis here is
that, unlike what is the case for many other image retrieval problems, our
images can be visually similar while being semantically quite different.
That is, data visualizations of the same type (e.g., bar charts) look
roughly the same, but almost always cover substantially different topics.
Focusing our attention on our textual features, we further investigated
different combinations of these features. The combination of
content-WPtitle (semantic relatedness between news content and Wikipedia
article title) and content-caption (semantic relatedness between the news
content and image caption) works the best. A ranker trained with these two
features achieved a 0.82 nDCG@5, which is similar to the performance of
the ranker trained on all textual features. The title-caption feature
(semantic relatedness between the news article title and the caption),
however, appears to be less effective. As such, for reasons of both
performance and parsimony, we use the model trained with just
content-WPtitle and content-caption in the final VizByWiki system. With
this model, for instance, the system was able to give a top rank to the
visualization in Figure 3 for the article in Figure 3 (as indicated by the
score in Figure 3).
7 DISCUSSION
The above evaluations showed that VizByWiki is able to retrieve useful
visualizations for up to approximately half of popular online news
articles of diverse types and is able to rank them with reasonable
quality. However, it is important to point out that the system has several
notable limitations.
First and foremost, VizByWiki is limited by the quality of data in
Wikimedia Commons. Despite being the largest repository of its type, the
Commons suffers from the same metadata standardization issues that are
common to all peer-production systems [9]. Fortunately, with the recently
announced multi-year project aiming to standardize data in Wikimedia
Commons [40], chances are that metadata quality will improve, making
VizByWiki more effective in the future.
Second, VizByWiki sometimes recommends visualizations with older data.
Relatedly, it also sometimes fails to retrieve data visualizations for
news that has very recently broken. There are two potential causes here.
First, VizByWiki is using a static snapshot of the Commons’ images, and
this may have resulted in us serving older versions of visualizations than
currently exist in the Commons. This problem could be addressed with a
larger-scale deployment using real-time Commons and Wikipedia data.
Second, it may be that there is a lag between an event occurring and
Wikipedia editors updating their visualizations to include the new data.
Future work should examine the lag time for visualization generation, as
has been done for text (which found the lag time to be relatively small
[15]).
Third, we observed that VizByWiki works better on some articles than on
others. Future work should seek to answer questions such as: What
characteristics make a news article suitable for data visualizations? What
is the availability of data visualizations of different types and on
different topics on Wikimedia Commons and on the Web more generally?
Finally, crowdsourced visualizations might not conform to specific
aesthetic requirements from publishers. However, there is a promising
opportunity to chain VizByWiki with other pipelines that reverse-engineer
data visualizations (e.g., [14, 27, 35]) to support a complete process of
finding data visualizations, extracting data and redesigning the graphics.
8 CONCLUSION
To address the challenge of automatically generating large numbers of data
visualizations for news articles, this paper defined the news data
visualization retrieval problem, which involves mining data visualizations
from the web to enhance news articles. We showed that this problem was
tractable by designing an end-to-end system, VizByWiki, which mines a
powerful-yet-untapped corpus of data visualizations: Wikimedia Commons. We
evaluated VizByWiki using popular online news articles of different types.
We found that the system could retrieve useful visualizations for many
popular articles and that it could achieve satisfying ranking quality. To
facilitate further progress on the news data visualization system, we are
releasing a demo of the system, our ground truth data, and our source code
(see URL on p. 2).
ACKNOWLEDGEMENTS
This work was funded in part by the U.S. National Science Foundation
(IIS-1702440, IIS-1707319, CAREER IIS-1707296, and IIS-1421438).
Table 4. The performance of the supervised ranker using dif-
ferent features and compared to the baseline method in [1].
Features
nDCG@3
nDCG@5
nDCG@7
baseline [1]
0.69 (0.30)
0.74 (0.25)
0.78 (0.21)
all textual features
0.77 (0.32)
0.82 (0.23)
0.84 (0.19)
all textual features
all visual features
0.69 (0.31)
0.75 (0.25)
0.78 (0.22)
content-caption
title-caption
0.69 (0.29)
0.75 (0.24)
0.79 (0.21)
content-WPtitle
title-caption
0.73 (0.26)
0.81 (0.19)
0.83 (0.17)
content-WPtitle
content-caption
0.79 (0.26)
0.82 (0.17)
0.85 (0.19)
Note: The nDCG is the mean nDCG across all queries (i.e. news arti-
cles). Standard deviations are in brackets.
REFERENCES
[1] Agrawal, R. et al. 2011. Enriching textbooks with images. Proceedings
of the 20th ACM international conference on Information and
knowledge management (2011), 1847–1856.
[2] Baack, S. 2011. A new style of news reporting: Wikileaks and data-
driven journalism. Cyborg Subjects. (2011), 10.
[3] Chang, S. et al. 2015. Got Many Labels?: Deriving Topic Labels from
Multiple Sources for Social Media Posts Using Crowdsourcing and En-
semble Learning. Proceedings of the 24th International Conference on
World Wide Web (New York, NY, USA, 2015), 397–406.
[4] Cheng, X. and Roth, D. 2013. Relational inference for wikification. Ur-
bana. 51, 61801 (2013), 16–58.
[5] Delgado, D. et al. 2010. Assisted News Reading with Automated Illus-
tration. Proceedings of the 18th ACM International Conference on Mul-
timedia (New York, NY, USA, 2010), 1647–1650.
[6] Ensan, F. and Bagheri, E. 2017. Document Retrieval Model Through
Semantic Linking. Proceedings of the Tenth ACM International Con-
ference on Web Search and Data Mining (New York, NY, USA, 2017),
181–190.
[7] Gabrilovich, E. and Markovitch, S. 2007. Computing semantic related-
ness using Wikipedia-based explicit semantic analysis. IJCAI (2007),
1606–1611.
[8] Gao, T. et al. 2014. NewsViews: an automated pipeline for creating cus-
tom geovisualizations for news. (2014), 3005–3014.
[9] Hall, A. et al. 2017. Freedom Versus Standardization: Structured Data
Generation in a Peer Production Community. Proceedings of the 2017
CHI Conference on Human Factors in Computing Systems (New York,
NY, USA, 2017), 6352–6362.
[10] Howard, A.B. 2014. The Art and Science of Data-Driven Journalism.
(2014).
[11] Hullman, J. et al. 2013. Contextifier: Automatic Generation of Anno-
tated Stock Visualizations. Proceedings of the SIGCHI Conference on
Human Factors in Computing Systems (New York, NY, USA, 2013),
2707–2716.
[12] Ipeirotis, P.G. et al. 2010. Quality Management on Amazon Mechanical
Turk. Proceedings of the ACM SIGKDD Workshop on Human Compu-
tation (New York, NY, USA, 2010), 64–67.
[13] Joachims, T. 2002. Optimizing Search Engines Using Clickthrough
Data. Proceedings of the Eighth ACM SIGKDD International Confer-
ence on Knowledge Discovery and Data Mining (New York, NY, USA,
2002), 133–142.
[14] Jung, D. et al. 2017. ChartSense: Interactive Data Extraction from Chart
Images. Proceedings of the 2017 CHI Conference on Human Factors
in Computing Systems (New York, NY, USA, 2017), 6706–6717.
[15] Keegan, B. et al. 2013. Hot Off the Wiki: Structures and Dynamics of
Wikipedia’s Coverage of Breaking News Events. American Behavioral
Scientist. 57, 5 (May 2013), 595–622.
DOI:https://doi.org/10.1177/0002764212469367.
[16] Kittur, A. et al. 2008. Crowdsourcing User Studies with Mechanical
Turk. Proceedings of the SIGCHI Conference on Human Factors in
Computing Systems (New York, NY, USA, 2008), 453–456.
[17] Li, W. and Zhuge, H. 2014. Summarising news with texts and pictures.
Semantics, Knowledge and Grids (SKG), 2014 10th International Con-
ference on (2014), 100–107.
[18] Li, Z. et al. 2016. Multimedia News Summarization in Search. ACM
Trans. Intell. Syst. Technol. 7, 3 (Feb. 2016), 33:1–33:20.
DOI:https://doi.org/10.1145/2822907.
[19] Li, Z. et al. 2011. News contextualization with geographic and visual
information. Proceedings of the 19th ACM international conference on
Multimedia (2011), 133–142.
[20] Li, Z. 2017. Understanding-Oriented Multimedia News Summariza-
tion. Understanding-Oriented Multimedia Content Analysis. Springer,
Singapore. 131–153.
[21] Lin, Y. et al. 2017. Problematizing and Addressing the Article-as-Con-
cept Assumption in Wikipedia. Proceedings of the 2017 ACM Confer-
ence on Computer Supported Cooperative Work and Social Computing
(New York, NY, USA, 2017), 2052–2067.
[22] Mackinlay, J. 1986. Automating the Design of Graphical Presentations
of Relational Information. ACM Trans. Graph. 5, 2 (Apr. 1986), 110–141.
DOI:https://doi.org/10.1145/22949.22950.
[23] Marcus, A. et al. 2013. Data In Context: Aiding News Consumers while
Taming Dataspaces. DBCrowd 2013. 47, (2013).
[24] Mihalcea, R. and Csomai, A. 2007. Wikify!: Linking Documents to En-
cyclopedic Knowledge. Proceedings of the Sixteenth ACM Conference
on Conference on Information and Knowledge Management (New
York, NY, USA, 2007), 233–242.
[25] Noraset, T. et al. 2014. WebSAIL Wikifier at ERD 2014. Proceedings
of the First International Workshop on Entity Recognition & Disam-
biguation (New York, NY, USA, 2014), 119–124.
[26] Parasie, S. and Dagiral, E. 2013. Data-driven journalism and the public
good: “Computer-assisted-reporters” and “programmer-journalists” in
Chicago. New media & society. 15, 6 (2013), 853–871.
[27] Poco, J. and Heer, J. 2017. Reverse-Engineering Visualizations: Recov-
ering Visual Encodings from Chart Images. Computer Graphics Fo-
rum. 36, 3 (Jun. 2017), 353–363.
DOI:https://doi.org/10.1111/cgf.13193.
[28] Ramisa, A. et al. 2016. Breakingnews: Article annotation by image and
text processing. arXiv preprint arXiv:1603.07141. (2016).
[29] Razavian, A.S. et al. 2014. CNN Features off-the-shelf: an Astounding
Baseline for Recognition. arXiv:1403.6382 [cs]. (Mar. 2014).
[30] Ren, D. et al. 2014. iVisDesigner: Expressive Interactive Design of In-
formation Visualizations. IEEE Transactions on Visualization and
Computer Graphics. 20, 12 (Dec. 2014), 2092–2101.
DOI:https://doi.org/10.1109/TVCG.2014.2346291.
[31] Satyanarayan, A. and Heer, J. 2014. Lyra: An Interactive Visualization
Design Environment. Computer Graphics Forum. 33, 3 (Jun. 2014),
351–360. DOI:https://doi.org/10.1111/cgf.12391.
[32] Savva, M. et al. 2011. Revision: Automated classification, analysis and
redesign of chart images. Proceedings of the 24th annual ACM sympo-
sium on User interface software and technology (2011), 393–402.
[33] Segel, E. and Heer, J. 2010. Narrative visualization: Telling stories with
data. IEEE transactions on visualization and computer graphics. 16, 6
(2010), 1139–1148.
[34] Sen, S. et al. 2014. WikiBrain: Democratizing Computation on Wikipe-
dia. Proceedings of The International Symposium on Open Collabora-
tion (New York, NY, USA, 2014), 27:1–27:10.
[35] Siegel, N. et al. 2016. FigureSeer: Parsing Result-Figures in Research
Papers. Computer Vision – ECCV 2016 (Oct. 2016), 664–680.
[36] Szegedy, C. et al. 2016. Rethinking the Inception Architecture for Com-
puter Vision. (2016), 2818–2826.
[37] Tsikrika, T. et al. 2011. Overview of the Wikipedia Image Retrieval
Task at ImageCLEF 2011. CLEF (Notebook Papers/Labs/Workshop)
(2011).
[38] Wang, P. et al. 2017. Concept Embedded Convolutional Semantic
Model for Question Retrieval. Proceedings of the Tenth ACM Interna-
tional Conference on Web Search and Data Mining (New York, NY,
USA, 2017), 395–403.
[39] WikiBrain: Advanced SR usage.: https://shilad.github.io/wikibrain/tu-
torial/advancedsr.html. Accessed: 2017-10-31.
[40] Wikimedia Foundation 2017. Wikimedia Foundation receives $3 mil-
lion grant from Alfred P. Sloan Foundation to make freely licensed im-
ages accessible and reusable across the web. Retrieved from
https://blog.wikimedia.org/2017/01/09/sloan-foundation-structured-
data/
[41] Wongsuphasawat, K. et al. 2016. Voyager: Exploratory Analysis via
Faceted Browsing of Visualization Recommendations. IEEE Transac-
tions on Visualization and Computer Graphics. 22, 1 (Jan. 2016), 649–658.
DOI:https://doi.org/10.1109/TVCG.2015.2467191.
[42] 2017. Help:Adding image. Wikipedia. Retrieved from https://en.wik-
ipedia.org/w/index.php?title=Help:Adding_image&oldid=764170156