1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
|
#!/usr/bin/python
# get-www-stats - Debian web site popularity statistics
# Copyright 2010 Marcin Owsiany <porridge@debian.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# This program is run from a crontab on a Debian website mirror like this:
#
# MAILTO="porridge@debian.org"
# # Atomically and concurrent-safely create a stats.tgz
# 18 3 * * * cd "$HOME" && d=$(mktemp -d stats-wip-XXXXXXXXXX) && printf '{"hostname":"\%s"}' $(hostname -f) > "$d/stats.meta.json" && ./get-www-stats > "$d/stats.json" && tar zcf stats-wip.tgz "$d" && rm -rf "$d" && mv stats-wip.tgz stats.tgz
#
# And the output is transferred to dde.debian.net like this:
#
# MAILTO="porridge@debian.org"
# # Atomically transfer stats and replace them.
# 18 4 * * * cd $HOME && { [ ! -e stats-old ] || please_cleanup_failed_run ; } && cp -al stats-new stats-old && ln -s stats-old stats-old.s && mv -T stats-old.s stats && { scp -q -i .ssh/stats-transfer-nopass senfl.debian.org:stats.tgz stats.tgz || scp_failed ; } && rm -rf stats-new && mkdir stats-new && tar zxf stats.tgz -C stats-new --strip-components=1 && rm stats.tgz && ln -s stats-new stats-new.s && mv -T stats-new.s stats && rm -rf stats-old
#
# The output is then exported via DDE (see http://wiki.debian.org/DDE) and used
# by the stattrans.pl script to sort the page lists in the Debian web site
# translation statistics pages.
try:
import json
except ImportError:
import simplejson as json
from gzip import open as gzopen
import logging
import os
import re
import sys
#logging.basicConfig(level=logging.INFO)
# Where the web server writes its access logs, the common name prefix of the
# logs belonging to this site, and how many of the newest logs to aggregate.
logs_dir = '/var/log/apache2'
logs_prefix = 'www.debian.org-access.log'
logs_count = 10

# Sort key given to the live (not yet rotated) log so it always ranks newest.
CURRENT_LOG_STAMP = 99999999
# Zero-based index of the whitespace-separated field holding the request URL
# (same field the Perl original read as $f[6]).
REQUEST_URL_FIELD = 6
# Sanity thresholds for the number of hits on the front page.
INDEX_HITS_FATAL = 10000
INDEX_HITS_WARN = 50000
# URLs with fewer hits than this are left out of the report.
MIN_REPORTED_HITS = 3

# Matches a two-character variant suffix such as ".en.html" at the end of a URL.
LANG_SUFFIX_RE = re.compile(r'\...\.html$')


def classify_log(name):
    """Turn a directory entry into a ``(stamp, name, gzipped)`` tuple.

    Returns None for entries that are not logs of this site (name does not
    start with ``logs_prefix``) and for names whose shape is not understood;
    the latter are reported with a warning. The live log (no rotation
    suffix) gets CURRENT_LOG_STAMP so that it sorts after every rotated log.
    """
    if not name.startswith(logs_prefix):
        return None
    parts = name.split('-')
    if len(parts) == 2:
        return (CURRENT_LOG_STAMP, name, False)
    if len(parts) == 3:
        gzipped = name.endswith('.gz')
        if gzipped:
            stamp = parts[2][:-3]
        else:
            stamp = parts[2]
        try:
            return (int(stamp), name, gzipped)
        except ValueError:
            # A non-numeric rotation suffix is just another unexpected name;
            # skip it instead of aborting the whole run.
            pass
    logging.warning('Skipping unexpected filename [%s].', name)
    return None


def find_logs():
    """Return the ``logs_count`` newest log entries of ``logs_dir``, oldest first."""
    entries = []
    for name in os.listdir(logs_dir):
        entry = classify_log(name)
        if entry is not None:
            entries.append(entry)
    return sorted(entries)[-logs_count:]


def normalize_url(url):
    """Collapse the variants of a page onto a single key.

    A trailing ".xx.html" suffix is dropped and a trailing "/" is expanded to
    "/index", so "/", "/index.en.html" and "/index.de.html" all become
    "/index".
    """
    url = LANG_SUFFIX_RE.sub('', url)
    if url.endswith('/'):
        url += 'index'
    return url


def url_from_line(line):
    """Return the normalized URL of one access-log line, or None.

    None is returned when the line has too few fields to contain a URL
    (blank or truncated lines).
    """
    if not isinstance(line, str):
        # Binary-mode files yield bytes on Python 3; on Python 2 lines are
        # already ``str`` and are used untouched.
        line = line.decode('utf-8', 'replace')
    tokens = line.split()
    if len(tokens) <= REQUEST_URL_FIELD:
        return None
    return normalize_url(tokens[REQUEST_URL_FIELD])


def count_urls(log_entries):
    """Count hits per normalized URL over the given log entries.

    ``log_entries`` is an iterable of ``(stamp, name, gzipped)`` tuples as
    produced by find_logs(). Returns a dict mapping URL to hit count.
    """
    counts = {}
    for _stamp, name, gzipped in log_entries:
        logfile = os.path.join(logs_dir, name)
        logging.info('Reading %s.', logfile)
        # Both kinds are opened in binary mode so lines have the same type.
        if gzipped:
            stream = gzopen(logfile, 'rb')
        else:
            stream = open(logfile, 'rb')
        try:
            for line in stream:
                url = url_from_line(line)
                if url is None:
                    logging.info('Skipping malformed line in %s.', logfile)
                    continue
                counts[url] = counts.get(url, 0) + 1
        finally:
            # Always release the file handle, even if parsing blows up.
            stream.close()
    return counts


def check_index_hits(counts):
    """Sanity-check the front page hit count.

    Raises Exception when there is no data for "/index" at all or when it has
    fewer than INDEX_HITS_FATAL hits; logs a warning when it has fewer than
    INDEX_HITS_WARN hits.
    """
    if '/index' not in counts:
        raise Exception('No data for /index')
    hits = counts['/index']
    # The fatal threshold must be tested first: anything below it is also
    # below the warning threshold, which would otherwise mask the error.
    if hits < INDEX_HITS_FATAL:
        raise Exception('Less than 10k hits for /index')
    if hits < INDEX_HITS_WARN:
        logging.warning('Less than 50k hits for /index')


def rank_counts(counts):
    """Return ``(hits, url)`` pairs with at least MIN_REPORTED_HITS hits, most popular first."""
    ranked = [(hits, url) for (url, hits) in counts.items()
              if hits >= MIN_REPORTED_HITS]
    ranked.sort(reverse=True)
    return ranked


def main():
    """Aggregate the newest access logs and print the ranking as JSON on stdout."""
    counts = count_urls(find_logs())
    check_index_hits(counts)
    json.dump(rank_counts(counts), sys.stdout, indent=2)


if __name__ == '__main__':
    main()
# for v, k in sorted([(v, k) for (k, v) in counts.iteritems()], reverse=True):
# print '%8d %s' % (v, k)
# if v < 3:
# break
# Perl original:
# @f=split;
# $s = $f[6];
# $s =~ s,\...\.html,,;
# $s =~ s,/$,/index,;
# $S{$s} += 1;
# END{
# printf "%d normalized URLs\n", scalar keys %S;
# foreach my $k (sort { $S{$b} <=> $S{$a} } keys %S) {
# printf "%8d %s\n", $S{$k}, $k
# }
# }
|