From 41a6e2e6bdb8b4199dd98d1c1e4ec5f27d8e5ddd Mon Sep 17 00:00:00 2001 From: Tom Scavo Date: Tue, 18 Oct 2016 09:27:26 -0400 Subject: [PATCH] Refactor conditional_get. --- bin/cget.sh | 87 ++++++---- install.sh | 1 - lib/conditional_get.sh | 32 ++-- lib/http_tools.sh | 365 ++++++++++++++++++++++++++++++++++++++++- 4 files changed, 437 insertions(+), 48 deletions(-) diff --git a/bin/cget.sh b/bin/cget.sh index dd8c500..3ea5aac 100755 --- a/bin/cget.sh +++ b/bin/cget.sh @@ -25,16 +25,16 @@ display_help () { This script retrieves and caches HTTP resources on disk. A previously cached resource is retrieved via HTTP Conditional GET [RFC 7232]. If the web server responds with HTTP 200 OK, - the resource is cached and written to stdout. OTOH, if the web - server responds with 304 Not Modified, the cache is refreshed - but no output. + the resource is cached and written to stdout. If the web + server responds with 304 Not Modified, the cached resource + is output instead. Usage: ${0##*/} [-hv] [-F | -C] URL This script takes a single command-line argument. The URL - argument is the absolute URL of an HTTP resource. By default, - the script requests the resource at the given URL using the - curl command-line tool. + argument is the absolute URL of an HTTP resource. The script + requests the resource at the given URL using the curl + command-line tool. This script requires two environment variables. CACHE_DIR is the absolute path to the cache directory (which may or may not @@ -42,7 +42,7 @@ display_help () { helper scripts. Options: - -h Display this message + -h Display this help message -v Write verbose messages to stdout -F Enables "Force Output Mode" -C Enables "Cache Only Mode" @@ -50,11 +50,14 @@ display_help () { Option -h is mutually exclusive of all other options. The default behavior of the script may be modified by using - option -F or -C, which are mutually exclusive. Force Output Mode - outputs the response body even if the server response is 304. 
- Cache Only Mode bypasses the GET request altogether and goes - directly to cache. If the resource resides in cache, it is - output on stdout, otherwise an error is thrown. + option -F or -C, which are mutually exclusive. Force Output + Mode (option -F) forces the return of a fresh resource. The + resource is output on stdout if and only if the server + responds with 200. If the response is 304, an error is thrown. + + Cache Only Mode (option -C) bypasses the GET request altogether + and goes directly to cache. If the resource resides in cache, + it is output on stdout, otherwise an error is thrown. LIBRARY @@ -66,9 +69,10 @@ display_help () { EXAMPLES - ${0##*/} URL # Retrieve the resource using HTTP conditional GET - ${0##*/} -F URL # Enable Force Output Mode - ${0##*/} -C URL # Enable Cache Only Mode + url=http://md.incommon.org/InCommon/InCommon-metadata.xml + ${0##*/} \$url # Retrieve the resource using HTTP conditional GET + ${0##*/} -F \$url # Enable Force Output Mode + ${0##*/} -C \$url # Enable Cache Only Mode HELP_MSG } @@ -81,8 +85,7 @@ script_name=${0##*/} # equivalent to basename $0 # library filenames (always list command_paths first) LIB_FILENAMES="command_paths.sh compatible_mktemp.sh -http_tools.sh -conditional_get.sh" +http_tools.sh" ####################################################################### # Process command-line options and arguments @@ -182,34 +185,46 @@ for lib_filename in $LIB_FILENAMES; do fi done -# create a temporary directory -tmp_dir=$( make_temp_file -d ) -if [ ! -d "$tmp_dir" ] ; then - printf "ERROR: $script_name unable to create temporary dir\n" >&2 - exit 2 +# determine temporary directory +if [ -n "$TMPDIR" ] && [ -d "$TMPDIR" ]; then + # use system temporary directory (remove trailing slash) + TMP_DIR="${TMPDIR%%/}/${script_name%%.*}" + $verbose_mode && printf "$script_name using temp dir: %s\n" "$TMP_DIR" +else + # create temporary directory + tmp_dir="$( make_temp_file -d )" + if [ ! 
-d "$tmp_dir" ] ; then + printf "ERROR: $script_name unable to create temporary dir\n" >&2 + exit 2 + fi + # use temporary directory (remove trailing slash) + TMP_DIR="${tmp_dir%%/}/${script_name%%.*}" + $verbose_mode && printf "$script_name creating temp dir: %s\n" "$TMP_DIR" +fi +if [ ! -d "$TMP_DIR" ]; then + /bin/mkdir "$TMP_DIR" + exit_status=$? + if [ $exit_status -ne 0 ]; then + echo "ERROR: $script_name failed to create dir: $TMP_DIR" >&2 + exit $exit_status + fi fi -$verbose_mode && printf "$script_name creating temp dir: %s\n" "$tmp_dir" + +# temporary file +tmp_file="${TMP_DIR}/http_resource_$$.xml" ####################################################################### # Main processing ####################################################################### -# invoke the function -response_body=$( - conditional_get $local_opts -d "$CACHE_DIR" -t "$tmp_dir" "$location" -) +# get the resource +conditional_get $local_opts -d "$CACHE_DIR" -t "$TMP_DIR" "$location" > "$tmp_file" exit_code=$? if [ $exit_code -ne 0 ]; then - echo "ERROR: ${script_name} failed to get resource: $location" >&2 + echo "ERROR: $script_name failed to get resource: $location" >&2 + printf "See output log: %s\n" "$TMP_DIR/$conditional_get_log" >&2 exit $exit_code fi -if $verbose_mode; then - /bin/cat "$tmp_dir/conditional_get_log" -else - if [ -n "$response_body" ]; then - echo "$response_body" - fi -fi - +/bin/cat "$tmp_file" exit 0 diff --git a/install.sh b/install.sh index 53d68e0..b09d1fb 100755 --- a/install.sh +++ b/install.sh @@ -108,7 +108,6 @@ done <&2 + return 1 fi + /bin/cat "${cached_content_file}" return 0 } diff --git a/lib/http_tools.sh b/lib/http_tools.sh index 4f49fa3..813071c 100755 --- a/lib/http_tools.sh +++ b/lib/http_tools.sh @@ -17,10 +17,373 @@ ####################################################################### # has this file already been sourced? 
if [ "$(type -t conditional_get)" = function ]; then
  return 0
fi

#######################################################################
#
# Given a web resource and a cache, if the resource is cached, request
# the resource using HTTP Conditional GET [RFC 7232], otherwise issue
# an ordinary GET request. In either case, if the server responds with
# 200, cache the resource and return the response body. If the server
# responds with 304, return the cached resource instead.
#
# Usage: conditional_get [-v] [-F | -C] -d CACHE_DIR -t TMP_DIR HTTP_LOCATION
#
# This function requires two option arguments (CACHE_DIR and TMP_DIR)
# and a command-line argument (HTTP_LOCATION). The rest of the command
# line is optional.
#
# Options:
#   -v  verbose mode (messages are written to $TMP_DIR/conditional_get_log)
#   -F  force the return of a fresh resource
#   -C  check the cache only
#   -d  the cache directory (REQUIRED, must exist)
#   -t  a temporary directory (REQUIRED, must exist)
#
# Use option -F or -C to alter the default behavior of the function.
# If both are given, the last one on the command line wins.
#
# Option -F forces the return of a fresh resource, that is, if the
# server responds with 304, an error occurs and the function returns
# with a nonzero return code.
#
# Option -C causes the function to go directly to cache. No GET request
# is issued. This option is useful in offline mode.
#
# Output:
#   stdout  the content of the resource (fresh or cached)
#   stderr  error and warning messages
#
# Return codes:
#   0      success
#   1      cache miss in -C mode, missing response header file,
#          unexpected HTTP response code, content length mismatch,
#          invalid digest, or a 304 response in -F mode
#   2      usage error (bad option, missing or nonexistent directory,
#          wrong number of arguments, command paths not initialized)
#   other  the exit code of a failed external command (openssl, curl,
#          cp) or helper function is passed through unchanged
#
# The output of the curl command-line tool is stored in the following
# temporary files (any leftovers from a previous call are removed
# before curl is invoked):
#
#   $TMP_DIR/conditional_get_curl_headers
#   $TMP_DIR/conditional_get_curl_content
#   $TMP_DIR/conditional_get_curl_stderr
#
# The cache consists of two files per resource, named after the MD5
# digest (hex) of the URL:
#
#   $CACHE_DIR/<md5>_headers
#   $CACHE_DIR/<md5>_content
#
# This function requires the following library file:
#
#   command_paths.sh
#
# This library file must be sourced BEFORE calling this function.
# The helper functions get_header_value and get_response_code must
# also be defined when this function is called.
#
# TODO:
#   - follow redirects?
#
#######################################################################

conditional_get () {

  # An unset or empty COMMAND_PATHS must count as "not initialized".
  # (A bare `! $COMMAND_PATHS` expands to an empty command in that
  # case and the check would silently pass.)
  if ! ${COMMAND_PATHS:-false}; then
    echo "ERROR: global command paths not found" >&2
    return 2
  fi

  local script_version="0.6"
  local user_agent_string="HTTP Conditional GET client $script_version"

  local hash
  local exit_code
  local cached_header_file
  local cached_content_file
  local conditional_get_mode
  local tmp_header_file
  local tmp_content_file
  local tmp_stderr_file
  local -a curl_args
  local do_conditional_get
  local header_value
  local cmd
  local response_code
  local declared_content_length
  local actual_content_length

  local verbose_mode=false
  local force_output_mode=false
  local cache_only_mode=false
  local cache_dir
  local tmp_dir
  local tmp_log_file
  local location

  # localizing OPTIND makes getopts start over on every call
  local opt
  local OPTARG
  local OPTIND
  while getopts ":vFCd:t:" opt; do
    case $opt in
      v)
        verbose_mode=true
        ;;
      F)
        force_output_mode=true
        cache_only_mode=false
        ;;
      C)
        cache_only_mode=true
        force_output_mode=false
        ;;
      d)
        cache_dir="$OPTARG"
        ;;
      t)
        tmp_dir="$OPTARG"
        ;;
      \?)
        echo "ERROR: $FUNCNAME: Unrecognized option: -$OPTARG" >&2
        return 2
        ;;
      :)
        echo "ERROR: $FUNCNAME: Option -$OPTARG requires an argument" >&2
        return 2
        ;;
    esac
  done

  # a temporary directory is required
  if [ -z "$tmp_dir" ]; then
    echo "ERROR: $FUNCNAME: no temporary directory specified" >&2
    return 2
  fi
  if [ ! -d "$tmp_dir" ]; then
    echo "ERROR: $FUNCNAME: directory does not exist: $tmp_dir" >&2
    return 2
  fi
  tmp_log_file="$tmp_dir/${FUNCNAME}_log"
  # the first log message truncates the log file; later ones append
  $verbose_mode && echo "$FUNCNAME using temporary directory $tmp_dir" > "$tmp_log_file"

  # a cache directory is required
  if [ -z "$cache_dir" ]; then
    echo "ERROR: $FUNCNAME: no cache directory specified" >&2
    return 2
  fi
  if [ ! -d "$cache_dir" ]; then
    echo "ERROR: $FUNCNAME: directory does not exist: $cache_dir" >&2
    return 2
  fi
  $verbose_mode && echo "$FUNCNAME using cache directory $cache_dir" >> "$tmp_log_file"

  # determine the URL location
  shift $(( OPTIND - 1 ))
  if [ $# -ne 1 ]; then
    echo "ERROR: $FUNCNAME: wrong number of arguments: $# (1 required)" >&2
    return 2
  fi
  location="$1"
  if [ -z "$location" ] ; then
    echo "ERROR: $FUNCNAME: empty URL argument" >&2
    return 2
  fi
  $verbose_mode && echo "$FUNCNAME using location $location" >> "$tmp_log_file"

  #######################################################################
  #
  # Determine the cache files (which may or may not exist at this point)
  #
  # This cache implementation uses separate files for the header and
  # body content. Does it make sense to cache a single file instead?
  # Also, should we use SHA-1 instead of MD5?
  #
  #######################################################################

  # openssl is the last stage of the pipeline, so $? is its exit status
  hash=$( printf '%s' "$location" | /usr/bin/openssl dgst -md5 -hex )
  exit_code=$?
  if [ $exit_code -ne 0 ]; then
    echo "ERROR: $FUNCNAME failed to hash the location URL" >&2
    return $exit_code
  fi
  # drop a "(stdin)= " style prefix, if any, keeping only the digest
  hash="${hash##* }"
  # an empty or malformed digest would make all URLs share cache files
  case "$hash" in
    ''|*[!0-9a-fA-F]*)
      echo "ERROR: $FUNCNAME failed to hash the location URL" >&2
      return 1
      ;;
  esac

  cached_header_file="$cache_dir/${hash}_headers"
  cached_content_file="$cache_dir/${hash}_content"

  if $verbose_mode; then
    echo "$FUNCNAME using cached header file: ${cached_header_file}" >> "$tmp_log_file"
    echo "$FUNCNAME using cached content file: ${cached_content_file}" >> "$tmp_log_file"
  fi

  # check if the resource is cached
  if [ -f "$cached_header_file" ] && [ -f "$cached_content_file" ]; then
    if $cache_only_mode; then
      /bin/cat "$cached_content_file"
      return 0
    fi
    conditional_get_mode=true
  else
    # ensure cache integrity (never keep one file without the other)
    /bin/rm -f "$cached_header_file" "$cached_content_file" >&2
    if $cache_only_mode; then
      echo "ERROR: $FUNCNAME failed to find cached resource: $location" >&2
      return 1
    fi
    conditional_get_mode=false
  fi

  #######################################################################
  #
  # Initialization
  #
  #######################################################################

  tmp_header_file="$tmp_dir/${FUNCNAME}_curl_headers"
  tmp_content_file="$tmp_dir/${FUNCNAME}_curl_content"
  tmp_stderr_file="$tmp_dir/${FUNCNAME}_curl_stderr"

  if $verbose_mode; then
    echo "$FUNCNAME using temp header file: ${tmp_header_file}" >> "$tmp_log_file"
    echo "$FUNCNAME using temp content file: ${tmp_content_file}" >> "$tmp_log_file"
    echo "$FUNCNAME using temp stderr file: ${tmp_stderr_file}" >> "$tmp_log_file"
  fi

  # The temporary directory may be reused between calls; make sure the
  # files inspected below were produced by this request, not an old one.
  /bin/rm -f "$tmp_header_file" "$tmp_content_file" "$tmp_stderr_file" >&2

  #######################################################################
  #
  # GET the web resource
  #
  # This implementation issues an HTTP Conditional GET request iff
  # the resource is cached.
  #
  #######################################################################

  # Build the curl command line as an array so that no argument (URL,
  # header value, file path) is ever re-parsed by the shell.
  curl_args=()
  if $verbose_mode; then
    curl_args+=( --verbose --progress-bar )
  else
    curl_args+=( --silent --show-error )
  fi
  curl_args+=( --user-agent "$user_agent_string" )
  curl_args+=( --dump-header "$tmp_header_file" )
  curl_args+=( --output "$tmp_content_file" )
  curl_args+=( --stderr "$tmp_stderr_file" )

  # If the resource is cached, add a conditional GET header.
  # Since "A recipient MUST ignore If-Modified-Since if the
  # request contains an If-None-Match header field," the
  # latter takes precedence in the following code block.
  do_conditional_get=false
  if $conditional_get_mode; then
    header_value=$( get_header_value "$cached_header_file" 'ETag' )
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
      echo "ERROR: $FUNCNAME: get_header_value failed to initialize" >&2
      return $exit_code
    fi
    if [ -n "$header_value" ]; then
      do_conditional_get=true
      curl_args+=( --header "If-None-Match: $header_value" )
    else
      header_value=$( get_header_value "$cached_header_file" 'Last-Modified' )
      exit_code=$?
      if [ $exit_code -ne 0 ]; then
        echo "ERROR: $FUNCNAME: get_header_value failed to initialize" >&2
        return $exit_code
      fi
      if [ -n "$header_value" ]; then
        do_conditional_get=true
        curl_args+=( --header "If-Modified-Since: $header_value" )
      fi
    fi
  fi

  # --url keeps a location that starts with "-" from being read as an option
  curl_args+=( --url "$location" )

  # invoke curl
  if $verbose_mode; then
    # %q yields a shell-quoted rendering that is for the log only
    cmd=$( printf '%q ' /usr/bin/curl "${curl_args[@]}" )
    printf "%s issuing curl command: %s\n" "$FUNCNAME" "${cmd% }" >> "$tmp_log_file"
  fi
  /usr/bin/curl "${curl_args[@]}"
  exit_code=$?
  if [ $exit_code -ne 0 ]; then
    echo "ERROR: $FUNCNAME: curl failed ($exit_code)" >&2
    return $exit_code
  fi

  #######################################################################
  #
  # Process the response
  #
  # This cache implementation always tries to cache a 200 response.
  # What if the response contains a "no-store" cache directive?
  #
  #######################################################################

  # sanity check
  if [ ! -f "$tmp_header_file" ]; then
    echo "ERROR: $FUNCNAME unable to find header file $tmp_header_file" >&2
    return 1
  fi

  response_code=$( get_response_code "$tmp_header_file" )
  exit_code=$?
  if [ $exit_code -ne 0 ]; then
    echo "ERROR: $FUNCNAME: get_response_code failed to initialize" >&2
    return $exit_code
  fi
  # %s (not %d) so an unexpected non-numeric value is logged, not rejected
  $verbose_mode && printf "%s received response code: %s\n" "$FUNCNAME" "$response_code" >> "$tmp_log_file"

  if [ "$response_code" = "200" ]; then

    # sanity check
    declared_content_length=$( get_header_value "$tmp_header_file" 'Content-Length' )
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
      echo "ERROR: $FUNCNAME: get_header_value failed to initialize" >&2
      return $exit_code
    fi
    # Stale files were removed above, so a missing body file here is
    # treated as an empty body (presumably curl wrote no data).
    [ -f "$tmp_content_file" ] || : > "$tmp_content_file"
    actual_content_length=$( /usr/bin/wc -c < "$tmp_content_file" )
    # some wc implementations pad the count with blanks
    actual_content_length="${actual_content_length//[[:space:]]/}"
    if [ -n "$declared_content_length" ]; then
      if [ "$declared_content_length" != "$actual_content_length" ]; then
        echo "ERROR: $FUNCNAME failed content length check" >&2
        return 1
      fi
    else
      echo "WARNING: Content-Length response header missing" >&2
    fi

    if $verbose_mode; then
      echo "$FUNCNAME downloaded ${actual_content_length} bytes" >> "$tmp_log_file"
      if $do_conditional_get; then
        echo "$FUNCNAME refreshing cache files" >> "$tmp_log_file"
      else
        echo "$FUNCNAME initializing cache files" >> "$tmp_log_file"
      fi
    fi

    # update the cache; maintain cache integrity at all times
    /bin/cp -f "$tmp_header_file" "$cached_header_file" >&2
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
      /bin/rm -f "$cached_header_file" "$cached_content_file" >&2
      echo "ERROR: $FUNCNAME failed copy to file $cached_header_file" >&2
      return $exit_code
    fi
    /bin/cp -f "$tmp_content_file" "$cached_content_file" >&2
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
      /bin/rm -f "$cached_header_file" "$cached_content_file" >&2
      echo "ERROR: $FUNCNAME failed copy to file $cached_content_file" >&2
      return $exit_code
    fi

  elif [ "$response_code" = "304" ]; then
    $verbose_mode && echo "$FUNCNAME downloaded 0 bytes (cache is up-to-date)" >> "$tmp_log_file"
  else
    echo "ERROR: $FUNCNAME failed with HTTP response code $response_code" >&2
    return 1
  fi

  #######################################################################
  #
  # output the resource content
  #
  #######################################################################

  # in Force Output Mode a 304 means no fresh resource was obtained
  if $force_output_mode && [ "$response_code" = "304" ]; then
    echo "ERROR: $FUNCNAME failed to get fresh resource: $location" >&2
    return 1
  fi

  /bin/cat "${cached_content_file}"
  return 0
}

#######################################################################
#
# This function takes a file containing an HTTP response header and