diff --git a/.DS_Store b/.DS_Store
index 310749a..57ae208 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/.gitignore b/.gitignore
index bddf4e2..f059112 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,10 @@
-# ignore the data files
+# ignore the data files from 01-basic-deployment
 01-basic-deployment/scripts/data/combined_data_cleaned.csv
 01-basic-deployment/scripts/data/combined_data.csv
 
-# ignore the done files
+# ignore the done files from 01-basic-deployment
 01-basic-deployment/scripts/extract_done
-01-basic-deployment/scripts/transform_done
\ No newline at end of file
+01-basic-deployment/scripts/transform_done
+
+# ignore the data files from 02-advanced-deployment
+02-advanced-deployment/docker/data/*
diff --git a/02-advanced-deployment/docker/Dockerfile b/02-advanced-deployment/docker/Dockerfile
new file mode 100644
index 0000000..6128e94
--- /dev/null
+++ b/02-advanced-deployment/docker/Dockerfile
@@ -0,0 +1,21 @@
+# Use an official Python runtime as a parent image
+FROM python:3.9-slim
+
+# Set the working directory
+WORKDIR /usr/src/app
+
+# Install required libraries
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the current directory contents into the container at /usr/src/app
+COPY . .
+
+# Define environment variable
+ENV OUTPUT_DIR=data
+
+# Make port 80 available to the world outside this container
+EXPOSE 80
+
+# Run script when the container launches
+CMD ["python", "./extract_and_transform.py"]
diff --git a/02-advanced-deployment/docker/docker-compose.yml b/02-advanced-deployment/docker/docker-compose.yml
new file mode 100644
index 0000000..cd969e0
--- /dev/null
+++ b/02-advanced-deployment/docker/docker-compose.yml
@@ -0,0 +1,28 @@
+version: '3'
+services:
+  postgres:
+    image: postgres:13
+    environment:
+      POSTGRES_USER: your_user
+      POSTGRES_PASSWORD: your_password
+      POSTGRES_DB: weather_data
+    ports:
+      - "5432:5432"
+    volumes:
+      - pgdata:/var/lib/postgresql/data
+  app:
+    build: .
+    depends_on:
+      - postgres
+    environment:
+      - DB_NAME=weather_data
+      - DB_USER=your_user
+      - DB_PASSWORD=your_password
+      - DB_HOST=postgres
+      - DB_PORT=5432
+    volumes:
+      - .:/usr/src/app
+    command: python extract_and_transform.py
+
+volumes:
+  pgdata:
diff --git a/02-advanced-deployment/docker/extract_and_transform.py b/02-advanced-deployment/docker/extract_and_transform.py
new file mode 100644
index 0000000..5ebbb1b
--- /dev/null
+++ b/02-advanced-deployment/docker/extract_and_transform.py
@@ -0,0 +1,87 @@
+import boto3
+from botocore.config import Config
+from botocore import UNSIGNED
+import os
+import pandas as pd
+import glob
+from sqlalchemy import create_engine, text
+
+def extract_noaa_gsod_data(year, month, output_dir='data'):
+    s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
+    bucket_name = 'noaa-gsod-pds'
+    prefix = f'{year}/{str(month).zfill(2)}'
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    # List objects in the bucket for the specified month
+    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
+
+    if 'Contents' not in response:
+        print(f"No files found for {prefix}")
+        return
+
+    for obj in response['Contents']:
+        key = obj['Key']
+        local_path = os.path.join(output_dir, os.path.basename(key))
+
+        # Download the file
+        s3.download_file(bucket_name, key, local_path)
+        print(f'Downloaded {key} to {local_path}')
+
+def transform_and_load_to_postgres(input_dir='data', db_name='weather_data'):
+    db_user = os.getenv('DB_USER', 'your_user')
+    db_password = os.getenv('DB_PASSWORD', 'your_password')
+    db_host = os.getenv('DB_HOST', 'postgres')  # Ensure this is 'postgres' in Docker
+    db_port = os.getenv('DB_PORT', '5432')
+
+    # Create SQLAlchemy engine
+    engine = create_engine(f'postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}')
+
+    # Ensure the 'weather' table exists (engine.begin() commits the DDL on exit)
+    with engine.begin() as conn:
+        conn.execute(text('''
+            CREATE TABLE IF NOT EXISTS weather (
+                station TEXT,
+                date TEXT,
+                latitude REAL,
+                longitude REAL,
+                elevation REAL,
+                name TEXT,
+                temp REAL,
+                temp_attributes TEXT,
+                dewp REAL,
+                dewp_attributes TEXT,
+                slp REAL,
+                slp_attributes TEXT,
+                stp REAL,
+                stp_attributes TEXT,
+                visib REAL,
+                visib_attributes TEXT,
+                wdsp REAL,
+                wdsp_attributes TEXT,
+                mxspd REAL,
+                gust REAL,
+                max REAL,
+                max_attributes TEXT,
+                min REAL,
+                min_attributes TEXT,
+                prcp REAL,
+                prcp_attributes TEXT,
+                sndp REAL,
+                frshtt TEXT
+            )
+        '''))
+
+    # Process each CSV file in the input directory
+    for file_path in glob.glob(f'{input_dir}/*.csv'):
+        print(f'Processing {file_path}')
+        df = pd.read_csv(file_path)
+
+        # Insert data into PostgreSQL table
+        df.to_sql('weather', engine, if_exists='append', index=False)
+        print(f'Loaded {file_path} into database')
+
+if __name__ == '__main__':
+    extract_noaa_gsod_data(2020, 1, output_dir=os.getenv('OUTPUT_DIR', 'data'))
+    transform_and_load_to_postgres(input_dir=os.getenv('OUTPUT_DIR', 'data'), db_name=os.getenv('DB_NAME', 'weather_data'))
\ No newline at end of file
diff --git a/02-advanced-deployment/docker/requirements.txt b/02-advanced-deployment/docker/requirements.txt
new file mode 100644
index 0000000..179a9e5
--- /dev/null
+++ b/02-advanced-deployment/docker/requirements.txt
@@ -0,0 +1,6 @@
+boto3==1.26.0
+botocore==1.29.0
+pandas==2.0.0
+numpy==1.25.0
+sqlalchemy==2.0.0
+psycopg2-binary==2.9.6
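
A minimal way to exercise this setup, assuming Docker with the Compose plugin is installed and the commands are run from 02-advanced-deployment/docker/ (the psql spot-check is only illustrative and reuses the credentials and table name defined in the compose file above; enter your_password if prompted):

    # build the image, start postgres, and run the one-shot extract-and-load container
    docker compose up --build

    # once the app service has exited, verify that rows landed in the weather table
    docker compose exec postgres psql -U your_user -d weather_data -c "SELECT COUNT(*) FROM weather;"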