Skip to content

Commit

Permalink
Ignore common bots in website stats
Browse files Browse the repository at this point in the history
  • Loading branch information
Rhys Smith committed Apr 3, 2017
1 parent 5fd9f02 commit fb32309
Showing 1 changed file with 9 additions and 6 deletions.
15 changes: 9 additions & 6 deletions utilities/stats-generate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -682,24 +682,27 @@ fi
# Website stats
# =====

# Set up grepping out bots
# Extended-regex alternation of crawler User-Agent substrings. It is applied
# case-insensitively (grep -Eiv), so any log line containing one of these
# names is left out of the website stats.
botstringlist="(Googlebot|Bingbot|DuckDuckBot|Baiduspider|Yandexbot|Sogou|Exabot|AhrefsBot|seoscanners)"

#######################################
# Print the access-log lines that count as a website content request.
# A line counts when it matches $apachesearchterm, is not from a listed bot
# or monitoring check, contains "200" and contains "/content/".
# Globals:
#   apachesearchterm (read) - grep pattern selecting the lines to report on
#   logslocation     (read) - directory holding www/<server>/ssl_access_log*
#   botstringlist    (read) - ERE of bot User-Agent substrings to drop
# Arguments:
#   One or more server directory names found under $logslocation/www
# Outputs:
#   The matching log lines on stdout, without any file-name prefix
#######################################
_www_content_requests() {
  local server
  local -a logfiles=()

  for server in "$@"; do
    logfiles+=("$logslocation"/www/"$server"/ssl_access_log*)
  done

  # With no file operands grep would block reading stdin (only possible if
  # nullglob is enabled elsewhere in the script).
  (( ${#logfiles[@]} > 0 )) || return 0

  # -h: never prefix lines with the file name, so the filters below only see
  # the log line itself and single-file and multi-file runs behave the same.
  # NOTE(review): apachesearchterm is defined outside this section; it is
  # left unquoted exactly as before in case it relies on word splitting --
  # confirm and quote it.
  # shellcheck disable=SC2086
  grep -h $apachesearchterm "${logfiles[@]}" \
    | grep -Eiv "$botstringlist" \
    | grep -Ev "(Sensu-HTTP-Check|dummy|check_http|Balancer)" \
    | grep 200 \
    | grep "/content/"
}

#######################################
# Count the content requests logged by the given servers.
# Arguments:
#   One or more server directory names found under $logslocation/www
# Outputs:
#   The number of matching log lines on stdout
#######################################
_www_count_content_requests() {
  _www_content_requests "$@" | wc -l
}

#######################################
# Print one count as a percentage of a total, with one decimal place.
# Arguments:
#   $1 - the partial count
#   $2 - the total count
# Outputs:
#   The percentage on stdout; "0.0" when the total is zero, empty or not a
#   number (bc would otherwise fail with a divide-by-zero error and leave
#   the result empty)
#######################################
_www_share_pc() {
  local part=${1//[[:space:]]/}
  local whole=${2//[[:space:]]/}

  [[ "$part" =~ ^[0-9]+$ ]] || part=0
  if ! [[ "$whole" =~ ^[0-9]+$ ]] || (( 10#$whole == 0 )); then
    printf '0.0\n'
    return 0
  fi

  # scale=4 keeps four decimals of the ratio before it is turned into a
  # percentage; awk then formats it to one decimal place.
  printf 'scale=4;(%s/%s)*100\n' "$part" "$whole" | bc | awk '{printf "%.1f\n", $0}'
}

# How many requests were there for the main content files?
wwwaccesscount=$(_www_count_content_requests web1 web2 www-ne-01 www-we-01)
# Same number formatted with thousands separators (%'d) for display.
wwwaccesscountfriendly=$(printf '%s\n' "$wwwaccesscount" | awk '{ printf ("%'"'"'d\n", $0) }')

# And from how many unique IdPs?
# Counted as the number of distinct values of the first log field, then
# formatted with thousands separators.
wwwaccessipcount=$(_www_content_requests web1 web2 www-ne-01 www-we-01 \
  | cut -d ' ' -f 1 \
  | sort -u \
  | wc -l \
  | awk '{ printf ("%'"'"'d\n", $0) }')

# Don't count these when doing daily stats
if [[ "$timeperiod" != "day" ]]; then

  # Per-server request count, and its share of the overall total
  wwwaccessweb1count=$(_www_count_content_requests web1)
  wwwaccessweb1pc=$(_www_share_pc "$wwwaccessweb1count" "$wwwaccesscount")
  wwwaccessweb2count=$(_www_count_content_requests web2)
  wwwaccessweb2pc=$(_www_share_pc "$wwwaccessweb2count" "$wwwaccesscount")
  wwwaccessne01count=$(_www_count_content_requests www-ne-01)
  wwwaccessne01pc=$(_www_share_pc "$wwwaccessne01count" "$wwwaccesscount")
  wwwaccesswe01count=$(_www_count_content_requests www-we-01)
  wwwaccesswe01pc=$(_www_share_pc "$wwwaccesswe01count" "$wwwaccesscount")
fi

Expand Down

0 comments on commit fb32309

Please sign in to comment.