From 3857af9b9941c975accd3442c817095e664b25b0 Mon Sep 17 00:00:00 2001 From: Tom Scavo <trscavo@internet2.edu> Date: Sat, 17 Jun 2017 16:36:04 -0400 Subject: [PATCH] Implement support for HTTP Compression --- lib/http_tools.sh | 140 +++++++++++++++++++++++++++++++++------------- 1 file changed, 100 insertions(+), 40 deletions(-) diff --git a/lib/http_tools.sh b/lib/http_tools.sh index 9812452..480d351 100755 --- a/lib/http_tools.sh +++ b/lib/http_tools.sh @@ -23,9 +23,9 @@ # issue an ordinary GET request for the resource. In either case, if # the server responds with 200, cache the resource and return the # response body. If the server responds with 304, return the cached -# resource instead. +# response body instead. # -# Usage: conditional_get [-v] [-F | -C] [-I] -d CACHE_DIR -T TMP_DIR HTTP_LOCATION +# Usage: conditional_get [-vFCIx] -d CACHE_DIR -T TMP_DIR HTTP_LOCATION # # This function requires two option arguments (CACHE_DIR and TMP_DIR) # and a command-line argument (HTTP_LOCATION). The rest of the command @@ -33,31 +33,58 @@ # # Options: # -v verbose mode -# -F force the return of a fresh resource +# -F force the return of fresh content # -C check the cache only -# -I return headers only +# -I get and return headers only +# -x enable HTTP Compression # -d the cache directory (REQUIRED) # -T a temporary directory (REQUIRED) # -# Use option -F, -C, or -I to alter the default behavior of the function. -# Options -F and -C are mutually exclusive of each another. Option -I -# may be used with option -C (but not with option -F). +# Use option -F, -C, or -I to alter the default behavior of the +# function. Options -F and -C are mutually exclusive of each other. +# Option -I may be used with option -C (but not with option -F). # -# Option -F forces the return of a fresh resource, that is, if the -# server responds with 304, the function quietly returns with a nonzero -# return code. +# Option -F forces the return of fresh content; that is, if option -F +# is enabled, and the server responds with 304, the function quietly +# returns with a nonzero return code. # -# Option -C causes the function to go directly to cache. No HTTP request -# is issued. (This option is useful in offline mode.) If the resource -# is not cached, the function quietly returns with a nonzero return code. +# Option -C causes the function to go directly to cache; that is, no +# HTTP request is issued. (This option is useful in offline mode.) +# If the resource is not cached, the function quietly returns with a +# nonzero return code. # -# Option -I issues a HEAD request instead of a GET request. In that case, -# only the response headers are returned in the output. Nothing is written -# when option -I is used. +# Option -I issues a HEAD request instead of a GET request, in which case, +# only the response headers are returned in the output. Note that nothing +# is written to cache when option -I is used. # -# If options -I and -C are used together, the cached headers are returned -# instead. As with option -C alone, if the resource is not cached, the -# function quietly returns with a nonzero return code. +# If options -I and -C are used together, the cached headers are returned. +# As with option -C alone, if the resource is not cached, the function +# quietly returns with a nonzero return code. +# +# HTTP COMPRESSION +# +# Option -x adds an Accept-Encoding header to the request; that is, if +# option -x is enabled, the client merely indicates its support for HTTP +# Compression in the request. The server may or may not compress the +# response. This implementation does not check to see if the response +# was in fact compressed. The HTTP response header will indicate if this +# is so. +# +# Important! This implementation treats compressed and uncompressed +# requests for the same resource as two distinct resources. For example, +# consider the following pair of function calls: +# +# conditional_get $url +# conditional_get -x $url +# +# The above requests result in two distinct cached resources, the content +# of which are identical. Assuming the server actually compressed the +# response of the latter, the headers will be different, however. In +# particular, the Content-Length values will be different in each case. +# Most importantly, the compressed response header will include a +# Content-Encoding header (whose value is invariably "gzip"). +# +# OUTPUT # # The output of the curl command-line tool is stored in the following # temporary files: @@ -66,6 +93,8 @@ # $TMP_DIR/conditional_get_curl_content # $TMP_DIR/conditional_get_curl_stderr # +# DEPENDENCIES +# # This function requires the following library file: # # core_lib.sh @@ -115,6 +144,7 @@ conditional_get () { local tmp_content_file local tmp_stderr_file local curl_opts + local adjective local do_conditional_get local header_value local cmd @@ -126,6 +156,7 @@ conditional_get () { local force_output_mode=false local cache_only_mode=false local headers_only_mode=false + local compressed_mode=false local cache_dir local tmp_dir local location @@ -133,7 +164,7 @@ conditional_get () { local opt local OPTARG local OPTIND - while getopts ":vFCId:T:" opt; do + while getopts ":vFCIxd:T:" opt; do case $opt in v) verbose_mode=true @@ -149,6 +180,9 @@ conditional_get () { I) headers_only_mode=true ;; + x) + compressed_mode=true + ;; d) cache_dir="$OPTARG" ;; @@ -212,8 +246,12 @@ conditional_get () { # Determine the cache files (which may or may not exist at this point) # # This cache implementation uses separate files for the header and - # body content. Does it make sense to cache a single file instead? - # Also, should we use SHA-1 instead of MD5? + # body content. It also uses a separate pair of files if option -x + # (i.e., HTTP Compression) is specified on the command line. + # + # Open Questions + # Does it make sense to cache a single file instead? + # Should we use SHA-1 instead of MD5? # ####################################################################### @@ -227,8 +265,14 @@ conditional_get () { return 4 fi - cached_header_file="$cache_dir/${hash}_headers" - cached_content_file="$cache_dir/${hash}_content" + # use distinct cache filenames for compressed mode + if $compressed_mode; then + cached_header_file="$cache_dir/${hash}_headers_compressed" + cached_content_file="$cache_dir/${hash}_content_compressed" + else + cached_header_file="$cache_dir/${hash}_headers" + cached_content_file="$cache_dir/${hash}_content" + fi print_log_message -D "$FUNCNAME using cached header file: $cached_header_file" print_log_message -D "$FUNCNAME using cached content file: $cached_content_file" @@ -283,6 +327,7 @@ conditional_get () { ####################################################################### # # GET the web resource + # If option -I was used, issue HEAD request instead # # This implementation issues an HTTP Conditional GET request iff # the resource is cached. @@ -297,15 +342,21 @@ conditional_get () { fi curl_opts="${curl_opts} --user-agent '${user_agent_string}'" + # set curl --compressed option if necessary + if $compressed_mode; then + adjective="compressed " + curl_opts="${curl_opts} --compressed" + fi + # always capture the header in a file # capture the output iff the script issues a GET request curl_opts="${curl_opts} --dump-header '${tmp_header_file}'" if $headers_only_mode; then - print_log_message -I "$FUNCNAME issuing HEAD request for resource: $location" + print_log_message -I "$FUNCNAME issuing HEAD request for ${adjective}resource: $location" curl_opts="${curl_opts} --head" curl_opts="${curl_opts} --output '/dev/null'" else - print_log_message -I "$FUNCNAME issuing GET request for resource: $location" + print_log_message -I "$FUNCNAME issuing GET request for ${adjective}resource: $location" curl_opts="${curl_opts} --output '${tmp_content_file}'" fi curl_opts="${curl_opts} --stderr '${tmp_stderr_file}'" @@ -390,27 +441,36 @@ conditional_get () { if [ "$response_code" = "200" ]; then - # sanity check - declared_content_length=$( get_header_value "$tmp_header_file" 'Content-Length' ) - return_code=$? - if [ $return_code -ne 0 ]; then - print_log_message -E "$FUNCNAME: get_header_value failed (return code: $return_code)" - return 6 - fi + # compute the length of the downloaded content actual_content_length=$( /bin/cat "$tmp_content_file" \ | /usr/bin/wc -c \ | $_SED -e 's/^[ ]*//' -e 's/[ ]*$//' ) - if [ -n "$declared_content_length" ]; then - if [ "$declared_content_length" != "$actual_content_length" ]; then - print_log_message -E "$FUNCNAME failed content length check" - return 3 + return_code=$? + if [ $return_code -ne 0 ]; then + print_log_message -E "$FUNCNAME: length calculation failed (return code: $return_code)" + return 3 + fi + print_log_message -D "$FUNCNAME downloaded ${actual_content_length} bytes" + + # this sanity check is applied only if option -x was NOT used + if ! $compressed_mode; then + declared_content_length=$( get_header_value "$tmp_header_file" 'Content-Length' ) + return_code=$? + if [ $return_code -ne 0 ]; then + print_log_message -E "$FUNCNAME: get_header_value failed (return code: $return_code)" + return 6 + fi + if [ -n "$declared_content_length" ]; then + if [ "$declared_content_length" != "$actual_content_length" ]; then + print_log_message -E "$FUNCNAME failed content length check" + return 3 + fi + else + print_log_message -W "$FUNCNAME: Content-Length response header missing" fi - else - print_log_message -W "$FUNCNAME: Content-Length response header missing" fi - print_log_message -D "$FUNCNAME downloaded ${actual_content_length} bytes" if $do_conditional_get; then print_log_message -D "$FUNCNAME refreshing cache files" else