#!/bin/sh
# Set the work directory; every later step runs relative to it.
export WORK=.

#######################################################################
# retrieve change data
#######################################################################
# Stop right away if the work directory cannot be entered, so that the
# (very long) checkout below never lands in the wrong place.
cd "$WORK" || exit 1
# First get the source code from the anonymous CVS mirror.
export CVSROOT=:pserver:anonymous@cvs-mirror.mozilla.org:/cvsroot
# Interactive step: when prompted, the password is "anonymous".
cvs login
# The following takes more than 24 hours.
cvs co mozilla

# Get changes: one compressed "cvs log" per top-level entry.
# WORK is a relative path ("."), so once we cd into the checkout
# "$WORK/..." would point inside it. Remember the current (work)
# directory as an absolute path; the extraction step that follows expects
# the *.log.gz files there.
logdir=$(pwd)
cd mozilla || exit 1
# NOTE(review): the glob keeps the "mozilla/" prefix although we are already
# inside the checkout -- confirm the intended directory layout.
for i in mozilla/*; do
  # Skip the literal pattern when the glob matches nothing.
  [ -e "$i" ] || continue
  # Strip the leading "mozilla/" to get the log file's base name.
  j=$(echo "$i" | sed 's"mozilla/""')
  cvs log "$i" | gzip > "$logdir/$j.log.gz"
done
cd ..

# Extract all individual deltas: every <name>.log.gz is run through
# extr.perl and stored as <name>.log1.gz. The name is echoed as a progress
# indicator.
for i in *.log.gz; do
  # Drop the ".log.gz" suffix with parameter expansion (no sed, no regex
  # metacharacters to get wrong).
  j=${i%.log.gz}
  echo "$j"
  perl extr.perl "$i" | gzip > "$j.log1.gz"
done

# Group changes into MRs based on login, comment and a three minute period
# (180 is passed to group.perl as the window in seconds).
for i in *.log1.gz; do gunzip < "$i"; done | perl group.perl 180 | gzip > allmrs.gz

####################################################################
# retrieve bug data
# it would be better to have direct access to mysql 
# that stores bug bugzilla data, but following can do 
# it purely using web access
###################################################################
# The processed results end up in details.gz, status.gz, and crdate.gz.
getbugs.perl
getbugnumbers.perl \
  | sort -n -u > bugnumbers

# The downloads below take about a week, so they are left commented out to
# avoid overwriting what has already been fetched. They could be sped up by
# running in parallel, but that might put load on Mozilla's web server;
# let's minimize that.
#./getbugdetails.perl print < bugnumbers 2> bugdetail1.err > bugdetail1.gz &
#./getbugdetails.perl status < bugnumbers 2> bugdetail2.err > bugdetail2.gz &
#./getbugdetails.perl html < bugnumbers > bugdetailhtml.err > bugdetail1html.gz < /dev/null &
# NOTE(review): the "html" line redirects stdout twice (the first ">" was
# presumably meant to be "2>") and its trailing "< /dev/null" overrides
# "< bugnumbers" -- fix before re-enabling it.

# Turn each raw download into its processed, compressed form.
gzip -dc < bugdetail1.gz \
  | ./procbugdetails.perl 2> details.err \
  | gzip > details.gz
gzip -dc < bugdetail2.gz \
  | ./procbugdetails2.perl \
  | gzip > status.gz
gzip -dc < bugdetail1html.gz \
  | ./procbugdetails3.perl 2> crdate.err \
  | gzip > crdate.gz

# Check whether everything has been retrieved: for each processed file,
# list the entries of bugnumbers that do not show up in it.
sort -u bugnumbers > b

# details.gz: compared on its 3rd ';'-separated field.
gzip -dc < details.gz | cut -d ';' -f 3 | sort -u > a
join -v 2 a b > remainingdetail1

# status.gz: compared on whatever follows the last '=' on each line.
gzip -dc < status.gz | gawk -F '=' '{ print $NF }' | sort -u > c
join -v 2 c b > remainingdetail2

# crdate.gz: compared on its 2nd ';'-separated field.
gzip -dc < crdate.gz | cut -d ';' -f 2 | sort -u > d
join -v 2 d b > remainingdetail3

# Re-fetch whatever is still missing. The three downloads run in parallel
# in the background, each writing its own compressed output and error log.
./getbugdetails.perl print < remainingdetail1 2> bugdetail11.err | gzip > bugdetail11.gz &
./getbugdetails.perl status < remainingdetail2 2> bugdetail21.err | gzip > bugdetail21.gz &
./getbugdetails.perl html < remainingdetail3 2> bugdetail31.err | gzip > bugdetail31.gz &

# Block until all three background downloads have finished; otherwise the
# merge below would read empty or truncated *.gz files.
wait

# Merge the first download with the re-fetched remainder and reprocess.
# "gzip -dc" is the portable spelling of gzcat.
gzip -dc bugdetail1.gz bugdetail11.gz | ./procbugdetails.perl 2> details.err | gzip > details.gz
gzip -dc bugdetail2.gz bugdetail21.gz | ./procbugdetails2.perl | gzip > status.gz
gzip -dc bugdetail1html.gz bugdetail31.gz | ./procbugdetails3.perl | gzip > crdate.gz

# Now it looks OK: repeat the completeness check on the merged files.
# Each sorted key list goes to its own scratch file (a1, c1, d1) and each
# result to its own remainingdetailN file, so no check clobbers another.
gunzip < details.gz | cut -d\; -f3 | sort -u > a1
# Still printed to stdout for eyeballing, and also stored like the others.
join -v 2 a1 b | tee remainingdetail1
gunzip < status.gz | gawk -F\= '{print $NF;}' | sort -u > c1
join -v 2 c1 b > remainingdetail2
gunzip < crdate.gz | cut -d\; -f2 | sort -u > d1
join -v 2 d1 b > remainingdetail3

#######################################################################
#now start processing
#######################################################################
#based on 
#deltadata in *log1.gz
#mr data in allmrs.gz
#bug data in details.gz, status.gz, and crdate.gz

# Collect all emails and names and try to arrive at a comprehensive
# identification of developers. Each source produces one loginN file.

# login0: field 7 of every delta record. The perl filter turns the first "%"
# into "@", lower-cases the line, and appends "@netscape.com" to entries that
# still contain no "@". The data is piped straight into perl rather than
# staged in a fixed-name scratch file under /tmp.
for i in *.log1.gz; do gunzip < "$i"; done | cut -d\; -f7 \
  | perl -ane '$_=~s/\%/\@/;$_=~tr/[A-Z]/[a-z]/;if (/\@/){print $_;}else{chop($_);print "$_"."\@"."netscape.com\n";}' \
  | sort -u > login0

# login1: whatever findemail.perl finds in field 12 of the delta records.
for i in *.log1.gz; do gunzip < "$i"; done | cut -d\; -f12 | perl findemail.perl | sort -u > login1

# login2: fields 2, 11 and 12 of the bug details, one per line, lower-cased.
gunzip < details.gz | cut -d\; -f2,11,12 | gawk -F\; '{print $1; print $2; print $3;}' | tr '[A-Z]' '[a-z]' | sort -u > login2

# login3: whatever findemail.perl finds in the bug status data.
gunzip < status.gz | perl findemail.perl | sort -u > login3

# Now that we have all logins -> match them with actual names.
# The glob is restricted to login0..login3: "login?" would also match the
# output file "logins" itself when the script is run a second time.
cat login[0-3] | sort -u | grep '@' > logins

#first see if prefix includes .
grep '\..*@' logins | wc
#655
grep '\..*@' logins | cut -d\@ -f1 | sort -u | wc
#646
#(ignore 11 duplicates) assume email is the id


###################################
#map directories into projects
#2D graphics: layout 
#widget interfaces for HTML and XUL: widget
#Clipping and Compositing: view
#mail/news: mailnews,xpfe,rdf,xpcom
#editor: editor
#javascript: js
#NetLib: netwerk
#Netscape Portable Runtime: nsprpub
#chatzilla: extensions/irc
#calendar: calendar
#ldap - directory
#dialup - cmd/dialup
#dom - dom,java/dom
#JIT - ef
#embeddable br. - embedding, webshell, java/webclient
#gtk -  gfx/src/gtk, widget/src/gtk, widget/timer/src/unix/gtk, embedding/browser/gtk
#html2ps - gfx/src/ps
#capabilities classes: caps
 
# List every file contained in the source snapshots.
# Do this on Linux first so that full names are kept.
snapshots=~/h/work/m1

tar -tzf "$snapshots/mozilla-19990128.tar.gz" \
  | grep -v -w CVS | sort -u > client.files

tar -tzf "$snapshots/mozilla-source.tar.gz" \
  | grep -v -w CVS | sort -u > clientnew.files
bzip2 -dc < "$snapshots/mozilla-source-M18.tar.bz2" \
  | tar -tf - \
  | grep -v -w CVS | sort -u > clientnew1.files

# Sort order differs on IRIX, so re-sort locally; entries ending in "/"
# (directories) are dropped at the same time.
sort client.files | grep -v '/$' > client.files1
sort clientnew1.files | grep -v '/$' > clientnew1.files1

# cvs.files: the distinct values of field 1 across all delta records.
for i in *.log1.gz; do
  gunzip < $i
done | gawk -F ';' '{ print $1 }' | sort -u > cvs.files

# Looks good: print snapshot files that have no CVS record. The commented
# lines after each command are the output recorded at the time.
join -v 2 cvs.files client.files1
#mozilla/nglayout.mak
join -v 2 cvs.files clientnew1.files1
#mozilla/.cvsignore
#mozilla/.mozconfig.mk
#mozilla/.mozconfig.out

# Summarize all MRs.
perl getclientmr.perl clientnew1.files1 > all.mr1
 
#############################
#Now count delta and lines added 
#############################
#Over time
# NOTE: from here on the statements are S/R code, not shell. They are meant
# to be run inside an R session started as shown on the next line. "_" is
# the legacy S assignment operator (same meaning as "<-").
#R --vsize=100M --nsize=5000k
# script.S is not part of this file; presumably it defines the plotting and
# reporting helpers (plotTrend, plotCode, report, ...) used below -- confirm.
source ("script.S");
# Read the MR summary (one ';'-separated record per MR) into the list x.
# In "what", 0 marks a numeric field and "" a character field.
x_scan("all.mr1", what=list(mr=0,time=0,ndelta=0,nadd=0,ndel=0,nameold="",bugs="",isPr=0,isClient=0,mods="",files="",name=""),sep=";");
# isClient1: logical version of the numeric isClient flag.
x$isClient1_rep(F, length(x$name));x$isClient1[x$isClient==1]_T;
# inside: TRUE when the name matches netscape.com or mozilla.org ...
x$inside_rep(F, length(x$name));x$inside[grep("netscape.com",x$name)]_T;
x$inside[grep("mozilla.org",x$name)]_T;
# ... or when nameold (note: not name) matches "cls@seawoo".
x$inside[grep("cls@seawoo",x$nameold)]_T;
# insideold: the same three rules, all applied to nameold.
x$insideold_rep(F, length(x$nameold));x$insideold[grep("netscape.com",x$nameold)]_T;
x$insideold[grep("mozilla.org",x$nameold)]_T;
x$insideold[grep("cls@seawoo",x$nameold)]_T;

#################
##make trend and code ownership plots
#################
# Plot, per month, the share of logins, MRs, deltas and added lines that
# belong to the FALSE level of insideold ("external" in the plot labels).
# Time buckets: x$time (seconds) is converted to years and truncated to
# 1/12-year steps.
# NOTE(review): a year is taken as 364.25 days here, whereas the bug-interval
# analysis later in this file uses 365.25 -- confirm which is intended.
postscript("exttrend.ps",paper="special",width=12, height=9,horizontal=F);
# Select every MR.
ind _ rep(T, length(x$name));
# name x insideold x month counts, reduced to 0/1 and summed over names:
# the number of distinct names per (insideold, month).
nlog_table(x$name[ind], x$insideold[ind],floor(x$time[ind]/3600/24/364.25*12)/12);
nlog[nlog>0]_1;
nlog_apply(nlog,c(2,3),sum);
# Number of MRs, and summed deltas / added lines, per (insideold, month).
nmr_table(x$insideold[ind],floor(x$time[ind]/3600/24/364.25*12)/12);
ndelta_tapply(x$ndelta[ind],list(x$insideold[ind],floor(x$time[ind]/3600/24/364.25*12)/12),sum);
nadd_tapply(x$nadd[ind],list(x$insideold[ind],floor(x$time[ind]/3600/24/364.25*12)/12),sum);
# Empty cells come back as NA; treat them as zero.
nlog[is.na(nlog)]_0;nmr[is.na(nmr)]_0;ndelta[is.na(ndelta)]_0;nadd[is.na(nadd)]_0;
# Row 1 is the FALSE level of insideold; divide by the column totals to get
# the per-month fraction.
plotTrend (nlog[1,]/apply(nlog,2,sum),nmr[1,]/apply(nmr,2,sum), ndelta[1,]/apply(ndelta,2,sum),nadd[1,]/apply(nadd,2,sum), c("Fraction of external logins per month","Fraction of external MRs per month","Fraction of external deltas per month", "Fraction of external lines added per month"),"");

#whats going on with line add peaks?
# For three selected time buckets (bucket*10 floored to 299, 296 and 306,
# i.e. tenths of a year since 1970) list the lines added per contributor
# among MRs whose insideold flag is FALSE, sorted ascending.
ind _ floor(floor(x$time/3600/24/364.25*12)/12*10)==299 & !x$insideold
sort(tapply(x$nadd[ind],x$name[ind],sum))
#Mozilla guys!

ind _ floor(floor(x$time/3600/24/364.25*12)/12*10)==296 & !x$insideold
# ind1 is assigned but not used by any statement visible in this file.
ind1 _ ind & x$nameold=="cls@seawood.org"
sort(tapply(x$nadd[ind],x$nameold[ind],sum))

ind _ floor(floor(x$time/3600/24/364.25*12)/12*10)==306 & !x$insideold
ind1 _ ind & x$nameold=="cls@seawood.org"
sort(tapply(x$nadd[ind],x$nameold[ind],sum))
#tim rowli from brown and somebody from ibm

# Per-module trend and code-ownership plots, four panels per page, written
# to all.ps.
postscript("all.ps",paper="letter");
par(mfrow=c(2,2));
# Client-only variant: set ind/i as below and then run the statements of the
# loop body by hand (presumably skipping the first line, which resets ind).
ind _ x$isClient1;i _ "Client";
#do code inside loop
# Loop over the 40 most frequent values of x$mods.
for (i in c(names(sort(-table(x$mods)))[1:40])){
# Select the MRs whose mods field matches i (grep treats i as a pattern).
ind _ rep(F, length(x$name));ind[grep(i,x$mods)]_T; 
# MR counts and summed deltas / added lines per (inside, month).
nmr_table(x$inside[ind],floor(x$time[ind]/3600/24/364.25*12)/12);
ndelta_tapply(x$ndelta[ind],list(x$inside[ind],floor(x$time[ind]/3600/24/364.25*12)/12),sum);
nadd_tapply(x$nadd[ind],list(x$inside[ind],floor(x$time[ind]/3600/24/364.25*12)/12),sum);
nmr[is.na(nmr)]_0;ndelta[is.na(ndelta)]_0;nadd[is.na(nadd)]_0;
# Absolute volume per month (deltas scaled by 1/5, lines by 1/1000) ...
plotTrend (apply(nmr,2,sum), apply(ndelta,2,sum)/5,apply(nadd,2,sum)/1000, c("Number of MRs per month","Number of delta/5 per month", "Number of lines added/1000 per month"),i);
# ... and the share of row 2 (the TRUE level of inside) per month.
# NOTE(review): row 2 only exists when both levels of inside occur in ind.
plotTrend (nmr[2,]/apply(nmr,2,sum), ndelta[2,]/apply(ndelta,2,sum),nadd[2,]/apply(nadd,2,sum), c("Fractio of internal MRs per month","Fraction of internal delta per month", "Fraction of internal lines added per month"),i);
# nn: names ordered by descending MR count; o0: how many there are.
# Neither is used directly here; presumably plotCode reads them as
# globals -- confirm against script.S.
nn_ names(sort(-table(x$name[ind])));
o0_length(nn);
# Ownership plot for all selected MRs, then for those with isPr==1.
tmp_plotCode(ind,i);
tmp1_plotCode(x$isPr==1&ind,i);
}

# Plot for a hand-picked set of modules, written to mozcum.ps.
mods _ c("/layout","/js","/rdf","/netwerk","/editor","/intl","/xpinstall")
postscript("mozcum.ps",paper="special",width=12, height=9,horizontal=F);
plotCode1(mods);



#################
#get size, productivity, and defect density stats
#################
# First a report over all MRs (labelled "*"), then one report for each of
# the 40 most frequent values of x$mods.
# nn / o0 are set before each plotCode call; presumably plotCode and report
# read them as globals -- confirm against script.S.
ind _ rep(T,length(x$name));i _ "*";
nn_ names(sort(-table(x$name[ind])));o0_length(nn);tmp_plotCode(ind,"");
report(ind,i,tmp);
for (i in names(sort(-table(x$mods)))[1:40]){
# Select the MRs whose mods field matches i.
ind _ rep(F, length(x$name));ind[grep(i,x$mods)]_T;
nn_ names(sort(-table(x$name[ind])));o0_length(nn);
tmp_plotCode(ind,i);
# Label plotCode's result: one row per name in nn, columns fr/mr/d/add.
# (The all-MR report above is passed tmp without these labels.)
dimnames(tmp)_list(nn,c("fr","mr","d","add"));
report(ind,i,tmp);
}


#################
#Count defect density
#################
# (Back in the shell.) Compute defect-density statistics for both MR
# summaries, then typeset the last 58 lines of the second one as a table.
perl defectDensity.perl < all.mr > defect.density
perl defectDensity.perl < all.mr1 > defect.density1
# In the header, a literal percent sign is written "%%": a backslash-escaped
# "%" is not a portable awk escape and can be read as the start of a
# conversion specification.
# Fields 5-7 are rescaled (x1000, x1000, x1000000) and fields 11-14 are
# turned into percentages before printing.
tail -58 defect.density1 \
  | gawk -F\; '
      BEGIN {
        printf "Module           NMR\tbug/KMR  bug/KDel  bug/MLOC  NFixers NCoders NTot %%FixInside  %%CodersInside %%FixMrInside  %%CodeMrInside\n";
      }
      {
        printf ("%18s %7d\t%7d\t%7d\t%7d\t%7d\t%7d\t%7d %8d %8d %8d %8d\n", $1, $2, $5*1000,$6*1000,$7*1000000,$8,$9,$10,$11*100,$12*100,$13*100,$14*100);
      }' \
  | a2ps -r --columns=1 --font=9 -o defect.density.ps --stdin="Defect density" -

# Look at participation rates: typeset the defectReporters.perl output as a
# table, sorted on the ';'-separated fields.
# The sort keys use -k (1-based); the old zero-based "+0 +1" form is
# obsolete and current sort implementations treat it as file names.
# Fields 4 and 6 are turned into percentages before printing.
perl defectReporters.perl < all.mr1 \
  | sort -t\; -k1 -k2 \
  | gawk -F\; 'BEGIN {printf "Type\t\t\tName\t\t\tNReporters pctInside NBugs pctInside\n";} {printf ("%9s %30s\t%7d\t%7d\t%7d\t%7d\n", $1, $2, $3, $4*100, $5, $6*100);}' \
  | a2ps -r --columns=1 --font=10 -o defect.reports.ps --stdin="Defect reporters" -

# Look at the resolution interval: run defectInterval.perl over both MR
# summaries; interval1 is what the S/R code that follows reads.
perl defectInterval.perl < all.mr > interval
perl defectInterval.perl < all.mr1 > interval1
#datef - fixed, dater - resolved, datev - verified
# R stuff below (this marker used to be a bare line, which the shell would
# have executed as the command "R stuff below").
# S/R code again ("_" is the legacy S assignment operator).
# script.S is not part of this file; presumably it provides the plotBugs
# helpers used further down -- confirm.
source("script.S");
# Read the ';'-separated bug interval records into the list y. In "what",
# 0 marks a numeric field and "" a character field.
y_scan("interval1", what=list(bug=0,pri="",o=0,ndates=0,date0=0,date1=0,c=0,nm=0,np=0,nc=0,mod1="",prod="",comp="",modc="",datef=0,dater=0,datev=0,isCh=0),sep=";");
# isClient: TRUE when modc matches "IsClient".
y$isClient_rep(F,length(y$bug));y$isClient[grep("IsClient", y$modc)]_T;
# Keep the original isCh value as nchanges, then clamp isCh to a 0/1 flag.
y$nchanges_y$isCh;y$isCh[y$isCh>0]_1;

# Among bugs with both o and c present (o+c is NA if either is), tabulate
# which of the fixed / resolved / verified dates are present: first as an
# unlabelled 3-way table, then with labelled dimensions, then one at a time.
table(!is.na(y$datef[!is.na(y$o+y$c)]),!is.na(y$dater[!is.na(y$o+y$c)]),!is.na(y$datev[!is.na(y$o+y$c)]))
table(list(f=!is.na(y$datef[!is.na(y$o+y$c)]),rr=!is.na(y$dater[!is.na(y$o+y$c)]),v=!is.na(y$datev[!is.na(y$o+y$c)])))
table(is.na(y$datef[!is.na(y$o+y$c)]))
table(is.na(y$dater[!is.na(y$o+y$c)]))
table(is.na(y$datev[!is.na(y$o+y$c)]))


#What are fixed bugs that do not have trace in CVS?
# Bugs with a fixed date, no recorded change (isCh==0), and a fixed date
# later than 100.7 years after 1900 (seconds since 1970 -> years, plus 70).
aa_y$bug[!is.na(y$datef)&y$isCh==0&y$datef/3600/24/365.25+70>100.7]

#See reporting trends
# Counts per 1/12-year bucket of y$o (expressed as a calendar year), limited
# to bugs with both o and c present and o after 1999:
# tmp = all such bugs, tmp1 = those flagged isClient, tmp2 = those with isCh>0.
tmp_table((floor(y$o/3600/24/365.25*12)/12+1970)[!is.na(y$o+y$c)&y$o/3600/24/365.25+1970>1999]);
tmp1_table((floor(y$o/3600/24/365.25*12)/12+1970)[y$isClient&!is.na(y$o+y$c)&y$o/3600/24/365.25+1970>1999]);
tmp2_table((floor(y$o/3600/24/365.25*12)/12+1970)[y$isCh>0&!is.na(y$o+y$c)&y$o/3600/24/365.25+1970>1999]);
# Plot the three series against time; the last bucket is dropped.
# NOTE(review): tmp1 and tmp2 are indexed with positions taken from tmp, which
# assumes all three tables contain the same buckets -- confirm.
matplot(as.numeric(names(tmp))[-length(tmp)], cbind(tmp[-length(tmp)],tmp2[-length(tmp)],tmp1[-length(tmp)]),type="l",
ylim=c(0,max(tmp)));

# Share of bugs per bucket that have isCh>0.
plot(as.numeric(names(tmp))[-length(tmp)], tmp2[-length(tmp)]/tmp[-length(tmp)],type="l",
ylim=c(0,max(tmp2/tmp)));
# Overall share of bugs with isCh set, over the same selection; the commented
# value below is the output recorded at the time.
mean(y$isCh[!is.na(y$o+y$c)&y$o/3600/24/365.25+1970>1999])
#0.1838688
#0.1838688

#divide interval into components (verification and such)
# Selection: bugs with verified and resolved dates, both o and c present,
# and o later than 1999.5.
ind _ !is.na(y$datev)&!is.na(y$dater)&!is.na(y$o+y$c)&y$o/3600/24/365.25+1970>1999.5
# Deciles (10%..90%) of the resolved -> verified time, in days; the commented
# numbers are the output recorded at the time.
quantile(y$datev[ind]-y$dater[ind], 1:9/10)/3600/24
#0.007511574  0.145061343  0.816497685  2.028943287  4.830335648
#9.226901620 19.720754630 41.306120370 95.470750000
# Deciles of the o -> resolved time, in days.
quantile(y$dater[ind]-y$o[ind], 1:9/10)/3600/24
#0.05114468  0.23068403  0.88466667  2.42005440  5.82220486 11.58149537
#22.06760995 41.99782407 84.05654167
 
# For half of all PRs that pass through both the Resolved and the Verified
# state, the resolved -> verified wait makes up about 44 percent or less of
# the whole o -> verified time (see the 5th decile below).
# Deciles of (verified - resolved) / (verified - o) over the same selection.
quantile((y$datev[ind]-y$dater[ind])/(y$datev[ind]-y$o[ind]),  1:9/10)
#0.00354502 0.04012123 0.12443593 0.26232221 0.43977956 0.64768240 0.81740503
#0.93207670 0.98749412


#
# Bug-interval plots. plotBugs1 / plotBugs are not defined in this file
# (presumably in script.S), so what exactly they draw cannot be told from
# here.
postscript("BugInterval.ps",paper="special",width=12, height=9,horizontal=F);
par(mfrow=c(2,2));
# prepos: label every bug by whether y$o falls before or after 2000.
prepos_rep("before 2000", length(y$o));
prepos[y$o/3600/24/365.25+1970>2000]_"after 2000";
mods _ c("/layout","/js","/rdf","/netwerk","/editor","/intl","/xpinstall")
# Selection: bugs with a resolved date and both o and c present.
plotBugs1(y, !is.na(y$dater)&!is.na(y$o+y$c),prepos,"", mods);

# Second file: the same selection, first restricted to o before 2000 ...
postscript("BugInterval2.ps",paper="letter");
par(mfrow=c(2,2));
plotBugs(!is.na(y$dater)&!is.na(y$o+y$c)&y$o/3600/24/365.25+1970<2000,"Pre 2000");

# ... then unrestricted.
plotBugs(!is.na(y$dater)&!is.na(y$o+y$c),"All");

#the rest not that interesting
# One plot for each of the 10 most frequent components and each of the 7
# most frequent products.
for (i in names(sort(-table(y$comp)))[1:10]){
  plotBugs(!is.na(y$dater)&!is.na(y$o+y$c)&y$comp==i,paste("Component", i));
}
for (i in names(sort(-table(y$prod)))[1:7]){
  plotBugs(!is.na(y$dater)&!is.na(y$o+y$c)&y$prod==i,paste("Product", i));
}

# How many of the selected bugs have o before 2000.
table((y$o/3600/24/365.25+1970<2000)[!is.na(y$dater)&!is.na(y$o+y$c)])

# Summary tables. The commented blocks after each statement are the outputs
# recorded at the time.
# Bugs by "has a change" (rows) versus isClient (columns).
table(y$isCh>0,y$isClient);
#      0    1 
#0 47403    0
#1  6415 4149

# Bugs without / with a change.
table(y$isCh>0);
#     0     1 
# 47403 10564

# Per priority: share of bugs that have a change.
table(y$pri, y$isCh>0)[,2]/(table(y$pri, y$isCh>0)[,1]+table(y$pri, y$isCh>0)[,2]);
#      P1        P2        P3        P4        P5 
# 0.3875442 0.2794606 0.1570678 0.2084691 0.1215278

# Per priority, among bugs that have a change: share flagged isClient.
tmp_table(y$pri[y$isCh>0], y$isClient[y$isCh>0]);
tmp[,2]/(tmp[,1]+tmp[,2]);
#   P1       P2        P3        P4        P5 
#0.4266667 0.481069 0.3720503 0.3046875 0.4571429

# Number of bugs per priority.
table(y$pri);
#   P1   P2    P3  P4  P5 
# 3677 4820 48565 614 288
# Frequency of each mod1, prod and comp value, in ascending order.
sort(table(y$mod1));
sort(table(y$prod));
sort(table(y$comp));

