From 3857af9b9941c975accd3442c817095e664b25b0 Mon Sep 17 00:00:00 2001
From: Tom Scavo <trscavo@internet2.edu>
Date: Sat, 17 Jun 2017 16:36:04 -0400
Subject: [PATCH] Implement support for HTTP Compression

---
 lib/http_tools.sh | 140 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 100 insertions(+), 40 deletions(-)

diff --git a/lib/http_tools.sh b/lib/http_tools.sh
index 9812452..480d351 100755
--- a/lib/http_tools.sh
+++ b/lib/http_tools.sh
@@ -23,9 +23,9 @@
 # issue an ordinary GET request for the resource. In either case, if 
 # the server responds with 200, cache the resource and return the 
 # response body. If the server responds with 304, return the cached 
-# resource instead.
+# response body instead.
 #
-# Usage: conditional_get [-v] [-F | -C] [-I] -d CACHE_DIR -T TMP_DIR HTTP_LOCATION
+# Usage: conditional_get [-vFCIx] -d CACHE_DIR -T TMP_DIR HTTP_LOCATION
 #
 # This function requires two option arguments (CACHE_DIR and TMP_DIR)
 # and a command-line argument (HTTP_LOCATION). The rest of the command
@@ -33,31 +33,58 @@
 #
 # Options:
 #   -v   verbose mode
-#   -F   force the return of a fresh resource
+#   -F   force the return of fresh content
 #   -C   check the cache only
-#   -I   return headers only
+#   -I   get and return headers only
+#   -x   enable HTTP Compression
 #   -d   the cache directory (REQUIRED)
 #   -T   a temporary directory (REQUIRED)
 #
-# Use option -F, -C, or -I to alter the default behavior of the function.
-# Options -F and -C are mutually exclusive of each another. Option -I
-# may be used with option -C (but not with option -F).
+# Use option -F, -C, or -I to alter the default behavior of the 
+# function. Options -F and -C are mutually exclusive. Option -I 
+# may be used with option -C (but not with option -F).
 #
-# Option -F forces the return of a fresh resource, that is, if the
-# server responds with 304, the function quietly returns with a nonzero 
-# return code.
+# Option -F forces the return of fresh content; that is, if option -F
+# is enabled, and the server responds with 304, the function quietly 
+# returns with a nonzero return code.
 #
-# Option -C causes the function to go directly to cache. No HTTP request
-# is issued. (This option is useful in offline mode.) If the resource
-# is not cached, the function quietly returns with a nonzero return code.
+# Option -C causes the function to go directly to cache; that is, no 
+# HTTP request is issued. (This option is useful in offline mode.) 
+# If the resource is not cached, the function quietly returns with a 
+# nonzero return code.
 #
-# Option -I issues a HEAD request instead of a GET request. In that case,
-# only the response headers are returned in the output. Nothing is written
-# when option -I is used.
+# Option -I issues a HEAD request instead of a GET request, in which case
+# only the response headers are returned in the output. Note that nothing 
+# is written to cache when option -I is used.
 #
-# If options -I and -C are used together, the cached headers are returned 
-# instead. As with option -C alone, if the resource is not cached, the 
-# function quietly returns with a nonzero return code.
+# If options -I and -C are used together, the cached headers are returned.
+# As with option -C alone, if the resource is not cached, the function 
+# quietly returns with a nonzero return code.
+#
+# HTTP COMPRESSION
+#
+# Option -x adds an Accept-Encoding header to the request; that is, if
+# option -x is enabled, the client merely indicates its support for HTTP 
+# Compression in the request. The server may or may not compress the 
+# response. This implementation does not check to see if the response 
+# was in fact compressed. The HTTP response header will indicate if this 
+# is so.
+#
+# Important! This implementation treats compressed and uncompressed 
+# requests for the same resource as two distinct resources. For example, 
+# consider the following pair of function calls:
+#
+#   conditional_get -d "$cache_dir" -T "$tmp_dir" "$url"
+#   conditional_get -x -d "$cache_dir" -T "$tmp_dir" "$url"
+#
+# The above requests result in two distinct cached resources, the content
+# of which is identical. Assuming the server actually compressed the
+# response of the latter, the headers will be different, however. In 
+# particular, the Content-Length values will be different in each case. 
+# Most importantly, the compressed response header will include a 
+# Content-Encoding header (whose value is typically "gzip").
+#
+# OUTPUT
 #
 # The output of the curl command-line tool is stored in the following 
 # temporary files:
@@ -66,6 +93,8 @@
 #   $TMP_DIR/conditional_get_curl_content
 #   $TMP_DIR/conditional_get_curl_stderr
 #
+# DEPENDENCIES
+#
 # This function requires the following library file:
 #
 # core_lib.sh
@@ -115,6 +144,7 @@ conditional_get () {
 	local tmp_content_file
 	local tmp_stderr_file
 	local curl_opts
+	local adjective
 	local do_conditional_get
 	local header_value
 	local cmd
@@ -126,6 +156,7 @@ conditional_get () {
 	local force_output_mode=false
 	local cache_only_mode=false
 	local headers_only_mode=false
+	local compressed_mode=false
 	local cache_dir
 	local tmp_dir
 	local location
@@ -133,7 +164,7 @@ conditional_get () {
 	local opt
 	local OPTARG
 	local OPTIND
-	while getopts ":vFCId:T:" opt; do
+	while getopts ":vFCIxd:T:" opt; do
 		case $opt in
 			v)
 				verbose_mode=true
@@ -149,6 +180,9 @@ conditional_get () {
 			I)
 				headers_only_mode=true
 				;;
+			x)
+				compressed_mode=true
+				;;
 			d)
 				cache_dir="$OPTARG"
 				;;
@@ -212,8 +246,12 @@ conditional_get () {
 	# Determine the cache files (which may or may not exist at this point)
 	#
 	# This cache implementation uses separate files for the header and
-	# body content. Does it make sense to cache a single file instead?
-	# Also, should we use SHA-1 instead of MD5?
+	# body content. It also uses a separate pair of files if option -x
+	# (i.e., HTTP Compression) is specified on the command line.
+	#
+	# Open Questions
+	#   Does it make sense to cache a single file instead?
+	#   Should we use SHA-1 instead of MD5?
 	#
 	#######################################################################
 
@@ -227,8 +265,14 @@ conditional_get () {
 		return 4
 	fi
 
-	cached_header_file="$cache_dir/${hash}_headers"
-	cached_content_file="$cache_dir/${hash}_content"
+	# use distinct cache filenames for compressed mode
+	if $compressed_mode; then
+		cached_header_file="$cache_dir/${hash}_headers_compressed"
+		cached_content_file="$cache_dir/${hash}_content_compressed"
+	else
+		cached_header_file="$cache_dir/${hash}_headers"
+		cached_content_file="$cache_dir/${hash}_content"
+	fi
 
 	print_log_message -D "$FUNCNAME using cached header file: $cached_header_file"
 	print_log_message -D "$FUNCNAME using cached content file: $cached_content_file"
@@ -283,6 +327,7 @@ conditional_get () {
 	#######################################################################
 	#
 	# GET the web resource
+	# If option -I was used, issue a HEAD request instead
 	#
 	# This implementation issues an HTTP Conditional GET request iff
 	# the resource is cached.
@@ -297,15 +342,21 @@ conditional_get () {
 	fi
 	curl_opts="${curl_opts} --user-agent '${user_agent_string}'"
 	
+	# set curl --compressed option if necessary
+	if $compressed_mode; then
+		adjective="compressed "
+		curl_opts="${curl_opts} --compressed"
+	fi
+
 	# always capture the header in a file
 	# capture the output iff the script issues a GET request
 	curl_opts="${curl_opts} --dump-header '${tmp_header_file}'"
 	if $headers_only_mode; then
-		print_log_message -I "$FUNCNAME issuing HEAD request for resource: $location"
+		print_log_message -I "$FUNCNAME issuing HEAD request for ${adjective}resource: $location"
 		curl_opts="${curl_opts} --head"
 		curl_opts="${curl_opts} --output '/dev/null'"
 	else
-		print_log_message -I "$FUNCNAME issuing GET request for resource: $location"
+		print_log_message -I "$FUNCNAME issuing GET request for ${adjective}resource: $location"
 		curl_opts="${curl_opts} --output '${tmp_content_file}'"
 	fi
 	curl_opts="${curl_opts} --stderr '${tmp_stderr_file}'"
@@ -390,27 +441,36 @@ conditional_get () {
 
 	if [ "$response_code" = "200" ]; then
 
-		# sanity check
-		declared_content_length=$( get_header_value "$tmp_header_file" 'Content-Length' )
-		return_code=$?
-		if [ $return_code -ne 0 ]; then
-			print_log_message -E "$FUNCNAME: get_header_value failed (return code: $return_code)"
-			return 6
-		fi
+		# compute the length of the downloaded content
 		actual_content_length=$( /bin/cat "$tmp_content_file" \
 			| /usr/bin/wc -c \
 			| $_SED -e 's/^[ ]*//' -e 's/[ ]*$//'
 		)
-		if [ -n "$declared_content_length" ]; then
-			if [ "$declared_content_length" != "$actual_content_length" ]; then
-				print_log_message -E "$FUNCNAME failed content length check"
-				return 3
+		return_code=$?
+		if [ $return_code -ne 0 ]; then
+			print_log_message -E "$FUNCNAME: length calculation failed (return code: $return_code)"
+			return 3
+		fi
+		print_log_message -D "$FUNCNAME downloaded ${actual_content_length} bytes"
+
+		# sanity check (skipped if option -x was used, since Content-Length may describe the compressed body)
+		if ! $compressed_mode; then
+			declared_content_length=$( get_header_value "$tmp_header_file" 'Content-Length' )
+			return_code=$?
+			if [ $return_code -ne 0 ]; then
+				print_log_message -E "$FUNCNAME: get_header_value failed (return code: $return_code)"
+				return 6
+			fi
+			if [ -n "$declared_content_length" ]; then
+				if [ "$declared_content_length" != "$actual_content_length" ]; then
+					print_log_message -E "$FUNCNAME failed content length check"
+					return 3
+				fi
+			else
+				print_log_message -W "$FUNCNAME: Content-Length response header missing"
 			fi
-		else
-			print_log_message -W "$FUNCNAME: Content-Length response header missing"
 		fi
 
-		print_log_message -D "$FUNCNAME downloaded ${actual_content_length} bytes"
 		if $do_conditional_get; then
 			print_log_message -D "$FUNCNAME refreshing cache files"
 		else