Brian Downing | b492bbd | 2007-07-12 09:16:11 -0500 | [diff] [blame] | 1 | #!/usr/bin/perl |
Brian Downing | 73f8936 | 2007-07-11 22:02:25 -0500 | [diff] [blame] | 2 | # |
| 3 | # This tool will print vaguely pretty information about a pack. It |
Dan McGee | 3b1eb12 | 2008-10-17 21:41:18 -0500 | [diff] [blame] | 4 | # expects the output of "git verify-pack -v" as input on stdin. |
Brian Downing | 73f8936 | 2007-07-11 22:02:25 -0500 | [diff] [blame] | 5 | # |
Dan McGee | 3b1eb12 | 2008-10-17 21:41:18 -0500 | [diff] [blame] | 6 | # $ git verify-pack -v | packinfo.pl |
Brian Downing | 73f8936 | 2007-07-11 22:02:25 -0500 | [diff] [blame] | 7 | # |
| 8 | # This prints some full-pack statistics; currently "all sizes", "all |
| 9 | # path sizes", "tree sizes", "tree path sizes", and "depths". |
| 10 | # |
| 11 | # * "all sizes" stats are across every object size in the file; |
| 12 | # full sizes for base objects, and delta size for deltas. |
| 13 | # * "all path sizes" stats are across all objects' "path sizes". |
| 14 | # A path size is the sum of the size of the delta chain, including the |
| 15 | # base object. In other words, it's how many bytes need be read to |
| 16 | # reassemble the file from deltas. |
| 17 | # * "tree sizes" are object sizes grouped into delta trees. |
| 18 | # * "tree path sizes" are path sizes grouped into delta trees. |
| 19 | # * "depths" should be obvious. |
| 20 | # |
| 21 | # When run as: |
| 22 | # |
Dan McGee | 3b1eb12 | 2008-10-17 21:41:18 -0500 | [diff] [blame] | 23 | # $ git verify-pack -v | packinfo.pl -tree |
Brian Downing | 73f8936 | 2007-07-11 22:02:25 -0500 | [diff] [blame] | 24 | # |
| 25 | # the trees of objects are output along with the stats. This looks |
| 26 | # like: |
| 27 | # |
| 28 | # 0 commit 031321c6... 803 803 |
| 29 | # |
| 30 | # 0 blob 03156f21... 1767 1767 |
| 31 | # 1 blob f52a9d7f... 10 1777 |
| 32 | # 2 blob a8cc5739... 51 1828 |
| 33 | # 3 blob 660e90b1... 15 1843 |
| 34 | # 4 blob 0cb8e3bb... 33 1876 |
| 35 | # 2 blob e48607f0... 311 2088 |
| 36 | # size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85 |
| 37 | # path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26 |
| 38 | # |
| 39 | # The first number after the sha1 is the object size, the second |
| 40 | # number is the path size. The statistics are across all objects in |
| 41 | # the previous delta tree. Obviously they are omitted for trees of |
| 42 | # one object. |
| 43 | # |
| 44 | # When run as: |
| 45 | # |
Dan McGee | 3b1eb12 | 2008-10-17 21:41:18 -0500 | [diff] [blame] | 46 | # $ git verify-pack -v | packinfo.pl -tree -filenames |
Brian Downing | 73f8936 | 2007-07-11 22:02:25 -0500 | [diff] [blame] | 47 | # |
| 48 | # it adds filenames to the tree. Getting this information is slow: |
| 49 | # |
| 50 | # 0 blob 03156f21... 1767 1767 Documentation/git-lost-found.txt @ tags/v1.2.0~142 |
| 51 | # 1 blob f52a9d7f... 10 1777 Documentation/git-lost-found.txt @ tags/v1.5.0-rc1~74 |
| 52 | # 2 blob a8cc5739... 51 1828 Documentation/git-lost+found.txt @ tags/v0.99.9h^0 |
| 53 | # 3 blob 660e90b1... 15 1843 Documentation/git-lost+found.txt @ master~3222^2~2 |
| 54 | # 4 blob 0cb8e3bb... 33 1876 Documentation/git-lost+found.txt @ master~3222^2~3 |
| 55 | # 2 blob e48607f0... 311 2088 Documentation/git-lost-found.txt @ tags/v1.5.2-rc3~4 |
| 56 | # size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85 |
| 57 | # path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26 |
| 58 | # |
| 59 | # When run as: |
| 60 | # |
Dan McGee | 3b1eb12 | 2008-10-17 21:41:18 -0500 | [diff] [blame] | 61 | # $ git verify-pack -v | packinfo.pl -dump |
Brian Downing | 73f8936 | 2007-07-11 22:02:25 -0500 | [diff] [blame] | 62 | # |
| 63 | # it prints out "sha1 size pathsize depth" for each sha1 in lexical |
| 64 | # order. |
| 65 | # |
| 66 | # 000079a2eaef17b7eae70e1f0f635557ea67b644 30 472 7 |
| 67 | # 00013cafe6980411aa6fdd940784917b5ff50f0a 44 1542 4 |
| 68 | # 000182eacf99cde27d5916aa415921924b82972c 499 499 0 |
| 69 | # ... |
| 70 | # |
| 71 | # This is handy for comparing two packs. Adding "-filenames" will add |
| 72 | # filenames, as per "-tree -filenames" above. |
| 73 | |
| 74 | use strict; |
| 75 | use Getopt::Long; |
| 76 | |
# Command-line switches.  Each one is a plain boolean flag, off by default:
#   -tree       print every delta tree in addition to the statistics
#   -filenames  label objects with "path @ revision"; only honoured
#               together with -tree or -dump
#   -dump       print one line per object instead of the full-pack summary
my ($filenames, $tree, $dump) = (0, 0, 0);
GetOptions(
    'tree'      => \$tree,
    'filenames' => \$filenames,
    'dump'      => \$dump,
);
| 83 | |
# Tables filled in while reading the verify-pack listing (and, when
# -filenames is given, while walking the commits).  Hashes are keyed by
# object name.
my (
    %parents,     # object => delta base it is stored against (undef for a base)
    %children,    # delta base => list of objects stored as deltas against it
    %sizes,       # object => size field from the listing
    @roots,       # objects without a delta base, in input order
    %paths,       # object => "path @ revision-name" label
    %types,       # object => type field from the listing
    @commits,     # every object of type "commit", in input order
    %names,       # object name => name reported by "git name-rev"
    %depths,      # object => delta depth (0 when the field is absent)
    @depths,      # the same depths as a flat list, for the statistics
);
| 94 | |
# Read the "git verify-pack -v" listing from stdin and record every object.
#
# Object lines are whitespace separated:
#   <name> <type> <size> <size-in-pack> <offset> [<depth> <base-name>]
# The last two fields are only present for deltified objects.  Every line
# whose first field is not a full hexadecimal object name (summary lines,
# blank lines, ...) is skipped.
while (my $line = <STDIN>) {
    # Fields four and five (size in pack, offset) are not used by this tool.
    my ($sha1, $type, $size, undef, undef, $depth, $parent) = split(/\s+/, $line);

    # Accept SHA-1 (40 hex digits) as well as SHA-256 (64 hex digits) names,
    # so packs from repositories using either hash are understood.
    next unless (defined $sha1 && $sha1 =~ /^(?:[0-9a-f]{40}|[0-9a-f]{64})$/);

    $depth ||= 0;
    $depths{$sha1} = $depth;
    push(@depths, $depth);

    $types{$sha1}   = $type;
    $sizes{$sha1}   = $size;
    $parents{$sha1} = $parent;
    push(@commits, $sha1) if ($type eq 'commit');

    if ($parent) {
        # Deltified object: hang it below its base.
        push(@{$children{$parent}}, $sha1);
    } else {
        # Base object: it is the root of a delta tree.  Nothing is recorded
        # in %children, so no bogus entry keyed by the empty string appears.
        push(@roots, $sha1);
    }
}
| 107 | |
# With -filenames (and -tree or -dump) work out a "path @ revision" label for
# every object that appears in the tree of some commit in the pack.  This
# runs "git ls-tree" once per commit, so it is slow.
if ($filenames && ($tree || $dump)) {
    # Map object names to symbolic names.  The list form of a pipe open runs
    # git directly, without passing the command line through a shell.
    open(my $names_fh, '-|', 'git', 'name-rev', '--all')
        or die "cannot run git name-rev: $!\n";
    while (my $line = <$names_fh>) {
        if ($line =~ /^(\S+)\s+(.*)$/) {
            $names{$1} = $2;
        }
    }
    close($names_fh)
        or warn "git name-rev --all did not exit cleanly (status $?)\n";

    for my $commit (@commits) {
        my $name = $names{$commit};
        open(my $tree_fh, '-|', 'git', 'ls-tree', '-t', '-r', $commit)
            or die "cannot run git ls-tree for $commit: $!\n";
        print STDERR "Plumbing tree $name\n";
        while (my $line = <$tree_fh>) {
            # Each line is "<mode> <type> <object> <path>"; only the object
            # name and the path are needed.  An object seen in several
            # commits keeps the label of the last commit processed.
            if ($line =~ /^\S+\s+\S+\s+(\S+)\s+(.*)$/) {
                my ($sha1, $path) = ($1, $2);
                $paths{$sha1} = "$path @ $name";
            }
        }
        close($tree_fh)
            or warn "git ls-tree $commit did not exit cleanly (status $?)\n";
    }
}
| 131 | |
# stats(LIST)
#
# Compute summary statistics over a list of numbers.
#
# Returns the seven-element list
#   (count, total, min, max, mean, median, std_dev)
# where
#   * median is the element at index int(count / 2) of the sorted data,
#     i.e. the upper of the two middle values when count is even;
#   * std_dev is the population standard deviation (divides by count).
#
# For an empty list every element of the result is 0, instead of dying with
# a division by zero.
sub stats {
    my @data  = sort { $a <=> $b } @_;
    my $count = scalar @data;

    # Nothing to summarise: avoid dividing by zero below.
    return (0, 0, 0, 0, 0, 0, 0) unless $count;

    my $total = 0;
    $total += $_ for @data;
    my $mean = $total / $count;

    my $diff_sum = 0;
    $diff_sum += ($_ - $mean) ** 2 for @data;
    my $std_dev = sqrt($diff_sum / $count);

    # The data is sorted, so min and max are the first and last elements.
    my ($min, $max) = @data[0, -1];
    my $median = $data[int($count / 2)];

    return ($count, $total, $min, $max, $mean, $median, $std_dev);
}
| 150 | |
# print_stats(NAME, LIST)
#
# Print one line on stdout with the statistics stats() computes for LIST,
# prefixed by the label NAME.
sub print_stats {
    my ($name, @values) = @_;
    # stats() hands back count, total, min, max, mean, median and std_dev in
    # exactly the order the format string consumes them.
    my @summary = stats(@values);
    printf("%s: count %s total %s min %s max %s mean %.2f median %s std_dev %.2f\n",
           $name, @summary);
}
| 157 | |
# Accumulators filled by dig().
my (
    @sizes,            # object sizes of the delta tree currently being walked
    @path_sizes,       # path sizes of the delta tree currently being walked
    @all_sizes,        # object sizes of every object visited so far
    @all_path_sizes,   # path sizes of every object visited so far
    %path_sizes,       # object => its path size
);
| 163 | |
# dig(SHA1, DEPTH, PATH_SIZE)
#
# Visit the object SHA1 and then, recursively, every object stored as a
# delta against it.  DEPTH is the level of SHA1 inside its delta tree and
# PATH_SIZE the summed size of the objects on the chain above it.
#
# The object's own size and its path size (PATH_SIZE plus its own size) are
# appended to the per-tree and the global accumulators and the path size is
# remembered in %path_sizes.  With -tree one line describing the object is
# printed before its children are visited.
sub dig {
    my ($sha1, $depth, $path_size) = @_;
    my $own_size = $sizes{$sha1};
    $path_size += $own_size;

    push(@$_, $own_size)  for (\@sizes, \@all_sizes);
    push(@$_, $path_size) for (\@path_sizes, \@all_path_sizes);
    $path_sizes{$sha1} = $path_size;

    printf("%3d%s %6s %s %8d %8d %s\n",
           $depth, (" " x $depth), $types{$sha1},
           $sha1, $own_size, $path_size, $paths{$sha1})
        if $tree;

    # Children inherit this object's path size as their starting point.
    foreach my $delta (@{$children{$sha1}}) {
        dig($delta, $depth + 1, $path_size);
    }
}
| 181 | |
# Per delta tree totals: summed object sizes and summed path sizes.
my (@tree_sizes, @tree_path_sizes);

# Walk each delta tree starting at its base object.
foreach my $root (@roots) {
    # Start with empty per-tree accumulators; dig() refills them.
    @sizes      = ();
    @path_sizes = ();
    dig($root, 0, 0);

    # Element 1 of what stats() returns is the total.
    push(@tree_sizes,      (stats(@sizes))[1]);
    push(@tree_path_sizes, (stats(@path_sizes))[1]);

    next unless $tree;

    # Statistics are pointless for a tree consisting of a single object.
    if (@sizes > 1) {
        print_stats(" size", @sizes);
        print_stats("path size", @path_sizes);
    }
    print "\n";
}
| 201 | |
# Final output: with -dump one "sha1 size pathsize depth path" line per
# object in lexical order of the object names, otherwise the full-pack
# statistics.
if (!$dump) {
    my @reports = (
        [" all sizes",       \@all_sizes],
        [" all path sizes",  \@all_path_sizes],
        [" tree sizes",      \@tree_sizes],
        ["tree path sizes",  \@tree_path_sizes],
        [" depths",          \@depths],
    );
    for my $report (@reports) {
        my ($label, $values) = @$report;
        print_stats($label, @$values);
    }
} else {
    foreach my $sha1 (sort keys %sizes) {
        print join(' ', $sha1, $sizes{$sha1}, $path_sizes{$sha1},
                   $depths{$sha1}, $paths{$sha1}), "\n";
    }
}