#!/bin/sh
# Retrieves the Mozilla CVS change history and the list of Bugzilla bug
# numbers.  The helper scripts called here (extr.perl, group.perl,
# getbugs.perl, getbugnumbers.perl) are not part of this file.

#set work directory
# Absolute path, so that "$WORK/..." keeps naming the same directory
# regardless of any later cd.
WORK=$(pwd)
export WORK

#######################################################################
# retrieve change data
#######################################################################
cd "$WORK" || exit 1

#first get the source code
export CVSROOT=:pserver:anonymous@cvs-mirror.mozilla.org:/cvsroot
cvs login
#passwd: anonymous
#following takes more than 24 hours
cvs co mozilla

#get changes
# One compressed "cvs log" per top-level entry of the checkout.  The loop
# runs from $WORK rather than from inside mozilla/: the glob below is
# written relative to $WORK and would match nothing after "cd mozilla".
for i in mozilla/*; do
  j=${i#mozilla/}
  cvs log "$i" | gzip > "$WORK/$j.log.gz"
done

#extract all individual delta
for i in *.log.gz; do
  j=${i%.log.gz}
  printf '%s\n' "$j"
  perl extr.perl "$i" | gzip > "$j.log1.gz"
done

#group changes into MRs based on login comment and three minute period
# (180 is the argument passed to group.perl: three minutes in seconds)
for i in *.log1.gz; do
  gunzip < "$i"
done | perl group.perl 180 | gzip > allmrs.gz

####################################################################
# retrieve bug data
# it would be better to have direct access to mysql
# that stores bugzilla data, but following can do
# it purely using web access
###################################################################
#results will be in details.gz, status.gz, and crdate.gz
# NOTE(review): the original line breaks of this file were lost; the next
# line may originally have been two separate commands (getbugs.perl, then
# getbugnumbers.perl | ...) -- confirm against the helper scripts.
getbugs.perl getbugnumbers.perl | sort -nu > bugnumbers

#following takes about a week, better not overwrite
#can be sped up by parallelising, but that might have impact on the
#mozilla's web server, let's minimize that.
# First-pass download of the bug details (kept commented out: slow, and the
# results should not be overwritten).
# NOTE(review): the third command redirects stdout twice and stdin twice as
# written; "> bugdetailhtml.err" was presumably meant to be "2>".
# NOTE(review): these commands write getbugdetails.perl output straight to
# *.gz while the re-fetch further down pipes it through gzip -- confirm
# which form matches what getbugdetails.perl emits.
#./getbugdetails.perl print < bugnumbers 2> bugdetail1.err > bugdetail1.gz &
#./getbugdetails.perl status < bugnumbers 2> bugdetail2.err > bugdetail2.gz &
#./getbugdetails.perl html < bugnumbers > bugdetailhtml.err > bugdetail1html.gz < /dev/null &

# Turn the raw downloads into details.gz, status.gz and crdate.gz.
gunzip < bugdetail1.gz | ./procbugdetails.perl 2> details.err | gzip > details.gz
gunzip < bugdetail2.gz | ./procbugdetails2.perl | gzip > status.gz
gunzip < bugdetail1html.gz | ./procbugdetails3.perl 2> crdate.err | gzip > crdate.gz

#check if all have been retrieved
# a/c/d: bug numbers present in each result; b: all wanted bug numbers;
# "join -v 2" prints the lines of b that have no match in the first file.
gunzip < details.gz | cut -d\; -f3 | sort -u > a
sort -u bugnumbers > b
join -v 2 a b > remainingdetail1
gunzip < status.gz | gawk -F\= '{print $NF;}' | sort -u > c
join -v 2 c b > remainingdetail2
gunzip < crdate.gz | cut -d\; -f2 | sort -u > d
join -v 2 d b > remainingdetail3

# Re-fetch whatever is still missing, three downloads in parallel.
./getbugdetails.perl print < remainingdetail1 2> bugdetail11.err | gzip > bugdetail11.gz &
./getbugdetails.perl status < remainingdetail2 2> bugdetail21.err | gzip > bugdetail21.gz &
./getbugdetails.perl html < remainingdetail3 2> bugdetail31.err | gzip > bugdetail31.gz &
# The merge below reads the files the background jobs are still writing,
# so block until all three have finished.
wait

# Merge first-pass and re-fetched data ("gzip -dc" is the portable
# spelling of gzcat).
gzip -dc bugdetail1.gz bugdetail11.gz | ./procbugdetails.perl 2> details.err | gzip > details.gz
gzip -dc bugdetail2.gz bugdetail21.gz | ./procbugdetails2.perl | gzip > status.gz
gzip -dc bugdetail1html.gz bugdetail31.gz | ./procbugdetails3.perl | gzip > crdate.gz

#now it looks OK
# Same completeness check on the merged data.  The first join prints to
# stdout for inspection, as in the original.
gunzip < details.gz | cut -d\; -f3 | sort -u > a1
join -v 2 a1 b
gunzip < status.gz | gawk -F\= '{print $NF;}' | sort -u > c1
join -v 2 c1 b > remainingdetail2
gunzip < crdate.gz | cut -d\; -f2 | sort -u > d1
join -v 2 d1 b > remainingdetail3

#######################################################################
#now start processing
#######################################################################
#based on
#deltadata in *log1.gz
#mr data in allmrs.gz
#bug data in details.gz, status.gz, and crdate.gz

#collect all email, names and try to have a comprehensive identification
# of developers
# login0: field 7 of every delta; the perl filter replaces the first "%"
# with "@", lower-cases the line, and appends "@netscape.com" to entries
# that contain no "@".
for i in *.log1.gz; do
  gunzip < "$i"
done \
  | cut -d\; -f7 \
  | perl -ane '$_=~s/\%/\@/;$_=~tr/[A-Z]/[a-z]/;if (/\@/){print $_;}else{chop($_);print "$_"."\@"."netscape.com\n";}' \
  | sort -u > login0
# login1: field 12 of every delta, filtered through findemail.perl
for i in *.log1.gz; do
  gunzip < "$i"
done | cut -d\; -f12 | perl findemail.perl | sort -u > login1
# login2: fields 2, 11 and 12 of the bug details, one per line, lower-cased
gunzip < details.gz | cut -d\; -f2,11,12 \
  | gawk -F\; '{print $1; print $2; print $3;}' \
  | tr '[A-Z]' '[a-z]' | sort -u > login2
# login3: status data filtered through findemail.perl
gunzip < status.gz | perl findemail.perl | sort -u > login3

# now that we have all logins -> match them with actual names
# login[0-3] rather than login?: the latter would also match the output
# file "logins" when this is re-run.
cat login[0-3] | sort -u | grep '@' > logins
#first see if prefix includes .
grep '\..*@' logins | wc
#655
grep '\..*@' logins | cut -d\@ -f1 | sort -u | wc
#646
#(ignore 11 duplicates) assume email is the id

###################################
#map directories into projects
#2D graphics: layout
#widget interfaces for HTML and XUL: widget
#Clipping and Compositing: view
#mail/news: mailnews,xpfe,rdf,xpcom
#editor: editor
#javascript: js
#NetLib: netwerk
#Netscape Portable Runtime: nsprpub
#chatzilla: extensions/irc
#calendar: calendar
#ldap - directory
#dialup - cmd/dialup
#dom - dom,java/dom
#JIT - ef
#embeddable br. - embedding, webshell, java/webclient
#gtk - gfx/src/gtk, widget/src/gtk, widget/timer/src/unix/gtk, embedding/browser/gtk
#html2ps - gfx/src/ps
#capabilities classes: caps

#get all the files in the repository:
#Do first on linux to keep full names
tar tzf ~/h/work/m1/mozilla-19990128.tar.gz | grep -wv CVS | sort -u > client.files
tar tzf ~/h/work/m1/mozilla-source.tar.gz | grep -wv CVS | sort -u > clientnew.files
bunzip2 < ~/h/work/m1/mozilla-source-M18.tar.bz2 | tar tf - | grep -wv CVS | sort -u > clientnew1.files
#sort is different on irix
# Re-sort locally and drop directory entries (names ending in "/").
sort client.files | grep -v '/$' > client.files1
sort clientnew1.files | grep -v '/$' > clientnew1.files1
# Field 1 of every delta record, de-duplicated.
for i in *.log1.gz; do
  gunzip < "$i"
done | gawk -F\; '{ print $1 }' | sort -u > cvs.files
#looks good
# Lines of the tarball listings that do not appear in cvs.files:
join -v 2 cvs.files client.files1
#mozilla/nglayout.mak
join -v 2 cvs.files clientnew1.files1
#mozilla/.cvsignore
#mozilla/.mozconfig.mk
#mozilla/.mozconfig.out

#summarize all MRs
perl getclientmr.perl clientnew1.files1 > all.mr1

#############################
#Now count delta and lines added
#############################
# The statements below are R/S code, not shell.  They are kept inside a
# quoted no-op here-document so that /bin/sh skips them; paste them into an
# R session (see the start-up hint on the first line).  "_" is the old
# S/R assignment operator.
# NOTE(review): these statements divide by 364.25 days per year where other
# parts of these notes use 365.25 -- confirm which is intended.
: <<'R_SESSION'
#Over time
#R --vsize=100M --nsize=5000k
source ("script.S");
x_scan("all.mr1",
  what=list(mr=0,time=0,ndelta=0,nadd=0,ndel=0,nameold="",bugs="",isPr=0,isClient=0,mods="",files="",name=""),
  sep=";");
x$isClient1_rep(F, length(x$name));x$isClient1[x$isClient==1]_T;
x$inside_rep(F, length(x$name));x$inside[grep("netscape.com",x$name)]_T;
x$inside[grep("mozilla.org",x$name)]_T;
x$inside[grep("cls@seawoo",x$nameold)]_T;
x$insideold_rep(F, length(x$nameold));x$insideold[grep("netscape.com",x$nameold)]_T;
x$insideold[grep("mozilla.org",x$nameold)]_T;
x$insideold[grep("cls@seawoo",x$nameold)]_T;

#################
##make trend and code ownership plots
#################
postscript("exttrend.ps",paper="special",width=12, height=9,horizontal=F);
ind _ rep(T, length(x$name));
nlog_table(x$name[ind], x$insideold[ind],floor(x$time[ind]/3600/24/364.25*12)/12);
nlog[nlog>0]_1;
nlog_apply(nlog,c(2,3),sum);
R_SESSION
# The statements below are R/S code, not shell.  They are kept inside a
# quoted no-op here-document so that /bin/sh skips them; paste them into
# the R session.  They use x, ind and nlog, which are not defined in this
# part of the notes, and plotTrend/plotCode/plotCode1/report, which are
# presumably defined in script.S.  "_" is the old S/R assignment operator.
# NOTE(review): time is divided by 364.25 days per year here but by 365.25
# in the bug-interval statements further down -- confirm which is intended
# (the bucket constants 296/299/306 depend on it).
: <<'R_SESSION'
nmr_table(x$insideold[ind],floor(x$time[ind]/3600/24/364.25*12)/12);
ndelta_tapply(x$ndelta[ind],list(x$insideold[ind],floor(x$time[ind]/3600/24/364.25*12)/12),sum);
nadd_tapply(x$nadd[ind],list(x$insideold[ind],floor(x$time[ind]/3600/24/364.25*12)/12),sum);
nlog[is.na(nlog)]_0;nmr[is.na(nmr)]_0;ndelta[is.na(ndelta)]_0;nadd[is.na(nadd)]_0;
plotTrend (nlog[1,]/apply(nlog,2,sum),nmr[1,]/apply(nmr,2,sum),
  ndelta[1,]/apply(ndelta,2,sum),nadd[1,]/apply(nadd,2,sum),
  c("Fraction of external logins per month","Fraction of external MRs per month","Fraction of external deltas per month",
    "Fraction of external lines added per month"),"");

#whats going on with line add peaks?
ind _ floor(floor(x$time/3600/24/364.25*12)/12*10)==299 & !x$insideold
sort(tapply(x$nadd[ind],x$name[ind],sum))
#Mozilla guys!
ind _ floor(floor(x$time/3600/24/364.25*12)/12*10)==296 & !x$insideold
ind1 _ ind & x$nameold=="cls@seawood.org"
sort(tapply(x$nadd[ind],x$nameold[ind],sum))
ind _ floor(floor(x$time/3600/24/364.25*12)/12*10)==306 & !x$insideold
ind1 _ ind & x$nameold=="cls@seawood.org"
sort(tapply(x$nadd[ind],x$nameold[ind],sum))
#tim rowli from brown and somebody from ibm

postscript("all.ps",paper="letter");
par(mfrow=c(2,2));
ind _ x$isClient1;i _ "Client";
#do code inside loop
# One set of trend plots for each of the 40 most frequent values of x$mods.
for (i in c(names(sort(-table(x$mods)))[1:40])){
  ind _ rep(F, length(x$name));ind[grep(i,x$mods)]_T;
  nmr_table(x$inside[ind],floor(x$time[ind]/3600/24/364.25*12)/12);
  ndelta_tapply(x$ndelta[ind],list(x$inside[ind],floor(x$time[ind]/3600/24/364.25*12)/12),sum);
  nadd_tapply(x$nadd[ind],list(x$inside[ind],floor(x$time[ind]/3600/24/364.25*12)/12),sum);
  nmr[is.na(nmr)]_0;ndelta[is.na(ndelta)]_0;nadd[is.na(nadd)]_0;
  plotTrend (apply(nmr,2,sum), apply(ndelta,2,sum)/5,apply(nadd,2,sum)/1000,
    c("Number of MRs per month","Number of delta/5 per month", "Number of lines added/1000 per month"),i);
  plotTrend (nmr[2,]/apply(nmr,2,sum), ndelta[2,]/apply(ndelta,2,sum),nadd[2,]/apply(nadd,2,sum),
    c("Fraction of internal MRs per month","Fraction of internal delta per month", "Fraction of internal lines added per month"),i);
  nn_ names(sort(-table(x$name[ind]))); o0_length(nn);
  tmp_plotCode(ind,i);
  tmp1_plotCode(x$isPr==1&ind,i);
}

mods _ c("/layout","/js","/rdf","/netwerk","/editor","/intl","/xpinstall")
postscript("mozcum.ps",paper="special",width=12, height=9,horizontal=F);
plotCode1(mods);

#################
#get size, productivity, and defect density stats
#################
ind _ rep(T,length(x$name));i _ "*";
nn_ names(sort(-table(x$name[ind])));o0_length(nn);tmp_plotCode(ind,"");
report(ind,i,tmp);
for (i in names(sort(-table(x$mods)))[1:40]){
  ind _ rep(F, length(x$name));ind[grep(i,x$mods)]_T;
  nn_ names(sort(-table(x$name[ind])));o0_length(nn);
  tmp_plotCode(ind,i);
  dimnames(tmp)_list(nn,c("fr","mr","d","add"));
  report(ind,i,tmp);
}
R_SESSION

#################
#Count defect density
#################
# NOTE(review): all.mr is read here but nothing visible in these notes
# creates it (only all.mr1) -- confirm where it comes from.
perl defectDensity.perl < all.mr > defect.density
perl defectDensity.perl < all.mr1 > defect.density1
# Format the last 58 lines of defect.density1 as a table and print it to
# PostScript.  Literal percent signs in the header are written "%%": a
# bare "%F"/"%C" in a printf format is a conversion specifier.
tail -n 58 defect.density1 \
  | gawk -F\; 'BEGIN {printf "Module NMR\tbug/KMR bug/KDel bug/MLOC NFixers NCoders NTot %%FixInside %%CodersInside %%FixMrInside %%CodeMrInside\n";}
      {printf ("%18s %7d\t%7d\t%7d\t%7d\t%7d\t%7d\t%7d %8d %8d %8d %8d\n",
               $1, $2, $5*1000,$6*1000,$7*1000000,$8,$9,$10,$11*100,$12*100,$13*100,$14*100);}' \
  | a2ps -r --columns=1 --font=9 -o defect.density.ps --stdin="Defect density" -

#look at participation rates
# "-k1 -k2" is the current spelling of the obsolete "+0 +1" sort keys.
perl defectReporters.perl < all.mr1 \
  | sort -t\; -k1 -k2 \
  | gawk -F\; 'BEGIN {printf "Type\t\t\tName\t\t\tNReporters pctInside NBugs pctInside\n";}
      {printf ("%9s %30s\t%7d\t%7d\t%7d\t%7d\n", $1, $2, $3, $4*100, $5, $6*100);}' \
  | a2ps -r --columns=1 --font=10 -o defect.reports.ps --stdin="Defect reporters" -

#look at resolution interval
perl defectInterval.perl < all.mr > interval
perl defectInterval.perl < all.mr1 > interval1

#datef - fixed, dater - resolved, datev - verified
#R stuff below
# R/S statements again, kept in a quoted no-op here-document so that
# /bin/sh skips them; paste them into the R session.
: <<'R_SESSION'
source("script.S");
y_scan("interval1",
  what=list(bug=0,pri="",o=0,ndates=0,date0=0,date1=0,c=0,nm=0,np=0,nc=0,mod1="",prod="",comp="",modc="",datef=0,dater=0,datev=0,isCh=0),
  sep=";");
y$isClient_rep(F,length(y$bug));y$isClient[grep("IsClient", y$modc)]_T;
# keep the raw count in nchanges, then reduce isCh to a 0/1 flag
y$nchanges_y$isCh;y$isCh[y$isCh>0]_1;
table(!is.na(y$datef[!is.na(y$o+y$c)]),!is.na(y$dater[!is.na(y$o+y$c)]),!is.na(y$datev[!is.na(y$o+y$c)]))
table(list(f=!is.na(y$datef[!is.na(y$o+y$c)]),rr=!is.na(y$dater[!is.na(y$o+y$c)]),v=!is.na(y$datev[!is.na(y$o+y$c)])))
table(is.na(y$datef[!is.na(y$o+y$c)]))
table(is.na(y$dater[!is.na(y$o+y$c)]))
table(is.na(y$datev[!is.na(y$o+y$c)]))

#What are fixed bugs that do not have trace in CVS?
aa_y$bug[!is.na(y$datef)&y$isCh==0&y$datef/3600/24/365.25+70>100.7]

#See reporting trends
tmp_table((floor(y$o/3600/24/365.25*12)/12+1970)[!is.na(y$o+y$c)&y$o/3600/24/365.25+1970>1999]);
tmp1_table((floor(y$o/3600/24/365.25*12)/12+1970)[y$isClient&!is.na(y$o+y$c)&y$o/3600/24/365.25+1970>1999]);
tmp2_table((floor(y$o/3600/24/365.25*12)/12+1970)[y$isCh>0&!is.na(y$o+y$c)&y$o/3600/24/365.25+1970>1999]);
matplot(as.numeric(names(tmp))[-length(tmp)],
  cbind(tmp[-length(tmp)],tmp2[-length(tmp)],tmp1[-length(tmp)]),type="l",
  ylim=c(0,max(tmp)));
plot(as.numeric(names(tmp))[-length(tmp)],
  tmp2[-length(tmp)]/tmp[-length(tmp)],type="l",
  ylim=c(0,max(tmp2/tmp)));
mean(y$isCh[!is.na(y$o+y$c)&y$o/3600/24/365.25+1970>1999])
#0.1838688

#divide interval into components (verification and such)
ind _ !is.na(y$datev)&!is.na(y$dater)&!is.na(y$o+y$c)&y$o/3600/24/365.25+1970>1999.5
quantile(y$datev[ind]-y$dater[ind], 1:9/10)/3600/24
#0.007511574 0.145061343 0.816497685 2.028943287 4.830335648
#9.226901620 19.720754630 41.306120370 95.470750000
quantile(y$dater[ind]-y$o[ind], 1:9/10)/3600/24
#0.05114468 0.23068403 0.88466667 2.42005440 5.82220486 11.58149537
#22.06760995 41.99782407 84.05654167
#half of all PRs that visit status Verified and Resolved take
#43 or less percent of the time between resolved and verified states
R_SESSION
# The statements below are R/S code, not shell.  They are kept inside a
# quoted no-op here-document so that /bin/sh skips them; paste them into
# the R session.  They use y and ind, which are not defined in this part of
# the notes, and plotBugs/plotBugs1, which are presumably defined in
# script.S.  "_" is the old S/R assignment operator.  Lines starting with
# "#" directly after a statement record the output seen when it was run.
: <<'R_SESSION'
# deciles of (verified - resolved) / (verified - opened)
quantile((y$datev[ind]-y$dater[ind])/(y$datev[ind]-y$o[ind]), 1:9/10)
#0.00354502 0.04012123 0.12443593 0.26232221 0.43977956 0.64768240 0.81740503
#0.93207670 0.98749412
#
postscript("BugInterval.ps",paper="special",width=12, height=9,horizontal=F);
par(mfrow=c(2,2));
# label every bug by whether it was opened before or after 2000
prepos_rep("before 2000", length(y$o));
prepos[y$o/3600/24/365.25+1970>2000]_"after 2000";
mods _ c("/layout","/js","/rdf","/netwerk","/editor","/intl","/xpinstall")
plotBugs1(y, !is.na(y$dater)&!is.na(y$o+y$c),prepos,"", mods);

postscript("BugInterval2.ps",paper="letter");
par(mfrow=c(2,2));
plotBugs(!is.na(y$dater)&!is.na(y$o+y$c)&y$o/3600/24/365.25+1970<2000,"Pre 2000");
plotBugs(!is.na(y$dater)&!is.na(y$o+y$c),"All");
#the rest not that interesting
# ten most frequent components, then seven most frequent products
for (i in names(sort(-table(y$comp)))[1:10]){
  plotBugs(!is.na(y$dater)&!is.na(y$o+y$c)&y$comp==i,paste("Component", i));
}
for (i in names(sort(-table(y$prod)))[1:7]){
  plotBugs(!is.na(y$dater)&!is.na(y$o+y$c)&y$prod==i,paste("Product", i));
}

table((y$o/3600/24/365.25+1970<2000)[!is.na(y$dater)&!is.na(y$o+y$c)])
table(y$isCh>0,y$isClient);
#      0    1
#0 47403    0
#1  6415 4149
table(y$isCh>0);
#     0     1
# 47403 10564
# per priority: share of bugs with isCh>0
table(y$pri, y$isCh>0)[,2]/(table(y$pri, y$isCh>0)[,1]+table(y$pri, y$isCh>0)[,2]);
# P1 P2 P3 P4 P5
# 0.3875442 0.2794606 0.1570678 0.2084691 0.1215278
# per priority, among bugs with isCh>0: share flagged isClient
tmp_table(y$pri[y$isCh>0], y$isClient[y$isCh>0]);
tmp[,2]/(tmp[,1]+tmp[,2]);
# P1 P2 P3 P4 P5
#0.4266667 0.481069 0.3720503 0.3046875 0.4571429
table(y$pri);
# P1 P2 P3 P4 P5
# 3677 4820 48565 614 288
sort(table(y$mod1));
sort(table(y$prod));
sort(table(y$comp));
R_SESSION