From fb323093c72ef7037dd7f49557af3126638f4e99 Mon Sep 17 00:00:00 2001 From: Rhys Smith Date: Mon, 3 Apr 2017 10:46:38 +0100 Subject: [PATCH] Ignore common bots in website stats --- utilities/stats-generate.sh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/utilities/stats-generate.sh b/utilities/stats-generate.sh index 8e09134b..a021ce27 100755 --- a/utilities/stats-generate.sh +++ b/utilities/stats-generate.sh @@ -682,24 +682,27 @@ fi # Website stats # ===== +# Set up grepping out bots +botstringlist="(Googlebot|Bingbo|DuckDuckBot|Baiduspider|Yandexbot|Sogou|Exabot|AhrefsBot|seoscanners)" + # How many requests were there for the main content files? -wwwaccesscount=$(grep $apachesearchterm $logslocation/www/web1/ssl_access_log* $logslocation/www/web2/ssl_access_log* $logslocation/www/www-ne-01/ssl_access_log* $logslocation/www/www-we-01/ssl_access_log* | grep -Ev "(Sensu-HTTP-Check|dummy|check_http|Balancer)" | grep 200 | grep "/content/" | wc -l) +wwwaccesscount=$(grep $apachesearchterm $logslocation/www/web1/ssl_access_log* $logslocation/www/web2/ssl_access_log* $logslocation/www/www-ne-01/ssl_access_log* $logslocation/www/www-we-01/ssl_access_log* | grep -Eiv "$botstringlist" | grep -Ev "(Sensu-HTTP-Check|dummy|check_http|Balancer)" | grep 200 | grep "/content/" | wc -l) wwwaccesscountfriendly=$(echo $wwwaccesscount | awk '{ printf ("%'"'"'d\n", $0) }') # And from how many unique IdPs? -wwwaccessipcount=$(grep $apachesearchterm $logslocation/www/web1/ssl_access_log* $logslocation/www/web2/ssl_access_log* $logslocation/www/www-ne-01/ssl_access_log* $logslocation/www/www-we-01/ssl_access_log* | grep -Ev "(Sensu-HTTP-Check|dummy|check_http|Balancer)" | grep 200 | grep "/content/" | cut -f 1 -d " " | cut -f 2-9 -d ":" | sort | uniq | wc -l | awk '{ printf ("%'"'"'d\n", $0) }') +wwwaccessipcount=$(grep $apachesearchterm $logslocation/www/web1/ssl_access_log* $logslocation/www/web2/ssl_access_log* $logslocation/www/www-ne-01/ssl_access_log* $logslocation/www/www-we-01/ssl_access_log* | grep -Eiv "$botstringlist" | grep -Ev "(Sensu-HTTP-Check|dummy|check_http|Balancer)" | grep 200 | grep "/content/" | cut -f 1 -d " " | cut -f 2-9 -d ":" | sort | uniq | wc -l | awk '{ printf ("%'"'"'d\n", $0) }') # Don't count these when doing daily stats if [[ "$timeperiod" != "day" ]]; then # Per-server request count - wwwaccessweb1count=$(grep $apachesearchterm $logslocation/www/web1/ssl_access_log* | grep -Ev "(Sensu-HTTP-Check|dummy|check_http|Balancer)" | grep 200 | grep "/content/" | wc -l) + wwwaccessweb1count=$(grep $apachesearchterm $logslocation/www/web1/ssl_access_log* | grep -Eiv "$botstringlist" | grep -Ev "(Sensu-HTTP-Check|dummy|check_http|Balancer)" | grep 200 | grep "/content/" | wc -l) wwwaccessweb1pc=$(echo "scale=4;($wwwaccessweb1count/$wwwaccesscount)*100" | bc | awk '{printf "%.1f\n", $0}') - wwwaccessweb2count=$(grep $apachesearchterm $logslocation/www/web2/ssl_access_log* | grep -Ev "(Sensu-HTTP-Check|dummy|check_http|Balancer)" | grep 200 | grep "/content/" | wc -l) + wwwaccessweb2count=$(grep $apachesearchterm $logslocation/www/web2/ssl_access_log* | grep -Eiv "$botstringlist" | grep -Ev "(Sensu-HTTP-Check|dummy|check_http|Balancer)" | grep 200 | grep "/content/" | wc -l) wwwaccessweb2pc=$(echo "scale=4;($wwwaccessweb2count/$wwwaccesscount)*100" | bc | awk '{printf "%.1f\n", $0}') - wwwaccessne01count=$(grep $apachesearchterm $logslocation/www/www-ne-01/ssl_access_log* | grep -Ev "(Sensu-HTTP-Check|dummy|check_http|Balancer)" | grep 200 | grep "/content/" | wc -l) + wwwaccessne01count=$(grep $apachesearchterm $logslocation/www/www-ne-01/ssl_access_log* | grep -Eiv "$botstringlist" | grep -Ev "(Sensu-HTTP-Check|dummy|check_http|Balancer)" | grep 200 | grep "/content/" | wc -l) wwwaccessne01pc=$(echo "scale=4;($wwwaccessne01count/$wwwaccesscount)*100" | bc | awk '{printf "%.1f\n", $0}') - wwwaccesswe01count=$(grep $apachesearchterm $logslocation/www/www-we-01/ssl_access_log* | grep -Ev "(Sensu-HTTP-Check|dummy|check_http|Balancer)" | grep 200 | grep "/content/" | wc -l) + wwwaccesswe01count=$(grep $apachesearchterm $logslocation/www/www-we-01/ssl_access_log* | grep -Eiv "$botstringlist" | grep -Ev "(Sensu-HTTP-Check|dummy|check_http|Balancer)" | grep 200 | grep "/content/" | wc -l) wwwaccesswe01pc=$(echo "scale=4;($wwwaccesswe01count/$wwwaccesscount)*100" | bc | awk '{printf "%.1f\n", $0}') fi