From 5d7f339fcf988ae0e630b3559773a73ff3c806bd Mon Sep 17 00:00:00 2001 From: Timothy Middelkoop Date: Thu, 6 May 2021 10:04:13 -0500 Subject: [PATCH] Basic slurm-gcp imported from standalone example --- .gitignore | 5 + slurm-gcp/Makefile | 10 ++ slurm-gcp/basic.tfvars | 155 +++++++++++++++++++++++ slurm-gcp/io.tf | 279 +++++++++++++++++++++++++++++++++++++++++ slurm-gcp/main.tf | 119 ++++++++++++++++++ slurm-gcp/versions.tf | 10 ++ 6 files changed, 578 insertions(+) create mode 100644 .gitignore create mode 100644 slurm-gcp/Makefile create mode 100644 slurm-gcp/basic.tfvars create mode 100644 slurm-gcp/io.tf create mode 100644 slurm-gcp/main.tf create mode 100644 slurm-gcp/versions.tf diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..aa4c729 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +terraform.tfstate +terraform.tfstate.backup +.terraform.lock.hcl +credentials.gcp +local* diff --git a/slurm-gcp/Makefile b/slurm-gcp/Makefile new file mode 100644 index 0000000..b49e868 --- /dev/null +++ b/slurm-gcp/Makefile @@ -0,0 +1,10 @@ +.PHONY: plan apply destroy + +plan: + terraform plan -var-file=basic.tfvars -out terraform.tfplan + +apply: + terraform apply -var-file=basic.tfvars -auto-approve + +destroy: + terraform destroy -var-file=basic.tfvars -auto-approve diff --git a/slurm-gcp/basic.tfvars b/slurm-gcp/basic.tfvars new file mode 100644 index 0000000..def6575 --- /dev/null +++ b/slurm-gcp/basic.tfvars @@ -0,0 +1,155 @@ +cluster_name = "class" +zone = "us-central1-c" # Also defined in partitions + +# network_name = "" +# subnetwork_name = "" +# shared_vpc_host_project = "" + +# disable_controller_public_ips = true +disable_login_public_ips = false +# disable_compute_public_ips = true + +# suspend_time = 300 + +controller_machine_type = "n1-standard-2" +controller_image = "projects/schedmd-slurm-public/global/images/family/schedmd-slurm-20-11-4-hpc-centos-7" +controller_disk_type = "pd-standard" +controller_disk_size_gb = 50 +# 
controller_labels = { +# key1 = "val1" +# key2 = "val2" +# } +# controller_service_account = "default" +# controller_scopes = ["https://www.googleapis.com/auth/cloud-platform"] +# cloudsql = { +# server_ip = "" +# user = "slurm" +# password = "verysecure" +# db_name = "slurm_accounting" +# } +# controller_secondary_disk = false +# controller_secondary_disk_size = 100 +# controller_secondary_disk_type = "pd-ssd" +# +# When specifying an instance template, specified controller fields will +# override the template properties. +# controller_instance_template = null + +login_machine_type = "n1-standard-2" +login_image = "projects/schedmd-slurm-public/global/images/family/schedmd-slurm-20-11-4-hpc-centos-7" +login_disk_type = "pd-standard" +login_disk_size_gb = 20 +# login_labels = { +# key1 = "val1" +# key2 = "val2" +# } +# login_node_count = 1 +# login_node_service_account = "default" +# login_node_scopes = [ +# "https://www.googleapis.com/auth/monitoring.write", +# "https://www.googleapis.com/auth/logging.write" +# ] +# +# When specifying an instance template, specified login fields will +# override the template properties. 
+# login_instance_template = null + +# Optional network storage fields +# network_storage is mounted on all instances +# login_network_storage is mounted on controller and login instances +# network_storage = [{ +# server_ip = "" +# remote_mount = "/home" +# local_mount = "/home" +# fs_type = "nfs" +# mount_options = null +# }] +# +# login_network_storage = [{ +# server_ip = "" +# remote_mount = "/net_storage" +# local_mount = "/shared" +# fs_type = "nfs" +# mount_options = null +# }] + +# compute_node_service_account = "default" +# compute_node_scopes = [ +# "https://www.googleapis.com/auth/monitoring.write", +# "https://www.googleapis.com/auth/logging.write" +# ] + +partitions = [ + { name = "debug" + machine_type = "n1-standard-2" + static_node_count = 0 + max_node_count = 10 + zone = "us-central1-c" + image = "projects/schedmd-slurm-public/global/images/family/schedmd-slurm-20-11-4-hpc-centos-7" + image_hyperthreads = false + compute_disk_type = "pd-standard" + compute_disk_size_gb = 10 + compute_labels = {} + cpu_platform = null + gpu_count = 0 + gpu_type = null + network_storage = [] + preemptible_bursting = false + vpc_subnet = null + exclusive = false + enable_placement = false + regional_capacity = false + regional_policy = {} + instance_template = null + }, + # { name = "partition2" + # machine_type = "n1-standard-16" + # static_node_count = 0 + # max_node_count = 20 + # zone = "us-west1-b" + # image = "projects/slurm-184304/global/images/family/schedmd-slurm-20-11-3-hpc-centos-7" + # image_hyperthreads = false + # + # compute_disk_type = "pd-ssd" + # compute_disk_size_gb = 20 + # compute_labels = { + # key1 = "val1" + # key2 = "val2" + # } + # cpu_platform = "Intel Skylake" + # gpu_count = 8 + # gpu_type = "nvidia-tesla-v100" + # network_storage = [{ + # server_ip = "none" + # remote_mount = "" + # local_mount = "/data" + # fs_type = "gcsfuse" + # mount_options = "file_mode=664,dir_mode=775,allow_other" + # }] + # preemptible_bursting = true + # 
vpc_subnet = null + # exclusive = false + # enable_placement = false + # + # ### NOTE #### + # # regional_capacity is under development. You may see slowness in + # # deleting lots of instances. + # # + # # With regional_capacity = true, the region can be specified in the zone. + # # Otherwise the region will be inferred from the zone. + # zone = "us-west1" + # regional_capacity = true + # # Optional + # regional_policy = { + # locations = { + # "zones/us-west1-a" = { + # preference = "DENY" + # } + # } + # } + # + # When specifying an instance template, specified compute fields will + # override the template properties. + # instance_template = "my-template" +] + diff --git a/slurm-gcp/io.tf b/slurm-gcp/io.tf new file mode 100644 index 0000000..207b911 --- /dev/null +++ b/slurm-gcp/io.tf @@ -0,0 +1,279 @@ +# +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "cloudsql" { + description = "Define an existing CloudSQL instance to use instead of instance-local MySQL" + type = object({ + server_ip = string, + user = string, + password = string, + db_name = string }) + default = null +} + +variable "cluster_name" { + description = "Name of the cluster" + type = string +} + +variable "compute_node_scopes" { + description = "Scopes to apply to compute nodes." 
+ type = list(string) + default = [ + "https://www.googleapis.com/auth/monitoring.write", + "https://www.googleapis.com/auth/logging.write" + ] +} + +variable "compute_node_service_account" { + description = "Service Account for compute nodes." + type = string + default = null +} + +variable "controller_machine_type" { + description = "Machine type to use for the controller instance" + type = string + default = null +} + +variable "controller_disk_type" { + description = "Disk type (pd-ssd or pd-standard) for controller." + type = string + default = null +} + +variable "controller_image" { + description = "Slurm image to use for the controller instance" + type = string + default = null +} + +variable "controller_instance_template" { + description = "Instance template to use to create controller instance" + type = string + default = null +} + +variable "controller_disk_size_gb" { + description = "Size of disk for the controller." + type = number + default = null +} + +variable "controller_labels" { + description = "Labels to add to controller instance. Map of key, value pairs." 
+ type = any + default = null +} + +variable "controller_secondary_disk" { + description = "Create secondary disk mounted to controller node" + type = bool + default = false +} + +variable "controller_secondary_disk_size" { + description = "Size of disk for the secondary disk" + default = 100 +} + +variable "controller_secondary_disk_type" { + description = "Disk type (pd-ssd or pd-standard) for secondary disk" + default = "pd-ssd" +} + +variable "controller_scopes" { + description = "Scopes to apply to the controller" + type = list(string) + default = ["https://www.googleapis.com/auth/cloud-platform"] +} + +variable "controller_service_account" { + description = "Service Account for the controller" + type = string + default = null +} + +variable "disable_login_public_ips" { + type = bool + default = true +} + +variable "disable_controller_public_ips" { + type = bool + default = true +} + +variable "disable_compute_public_ips" { + type = bool + default = true +} + +variable "login_disk_type" { + description = "Disk type (pd-ssd or pd-standard) for login nodes." + type = string + default = null +} + +variable "login_disk_size_gb" { + description = "Size of disk for login nodes." + type = number + default = null +} + +variable "login_image" { + description = "Slurm image to use for login instances" + type = string + default = null +} + +variable "login_instance_template" { + description = "Instance template to use to create login instances" + type = string + default = null +} + +variable "login_labels" { + description = "Labels to add to login instances. Map of key, value pairs." + type = any + default = null +} + +variable "login_machine_type" { + description = "Machine type to use for login node instances." + type = string + default = null +} + +variable "login_network_storage" { + description = "An array of network attached storage mounts to be configured on the login and controller instances." 
+ type = list(object({ + server_ip = string, + remote_mount = string, + local_mount = string, + fs_type = string, + mount_options = string })) + default = [] +} + +variable "login_node_scopes" { + description = "Scopes to apply to login nodes." + type = list(string) + default = [ + "https://www.googleapis.com/auth/monitoring.write", + "https://www.googleapis.com/auth/logging.write" + ] +} + +variable "login_node_service_account" { + description = "Service Account for login nodes." + type = string + default = null +} + +variable "login_node_count" { + description = "Number of login nodes in the cluster" + default = 1 +} + +variable "munge_key" { + description = "Specific munge key to use" + default = null +} + +variable "jwt_key" { + description = "Specific libjwt key to use" + default = null +} + +variable "network_name" { + default = null + type = string +} + +variable "network_storage" { + description = "An array of network attached storage mounts to be configured on all instances." + type = list(object({ + server_ip = string, + remote_mount = string, + local_mount = string, + fs_type = string, + mount_options = string })) + default = [] +} + +variable "partitions" { + description = "An array of configurations for specifying multiple machine types residing in their own Slurm partitions." 
+ type = list(object({ + name = string, + machine_type = string, + max_node_count = number, + zone = string, + image = string, + image_hyperthreads = bool, + compute_disk_type = string, + compute_disk_size_gb = number, + compute_labels = any, + cpu_platform = string, + gpu_type = string, + gpu_count = number, + network_storage = list(object({ + server_ip = string, + remote_mount = string, + local_mount = string, + fs_type = string, + mount_options = string })), + preemptible_bursting = bool, + vpc_subnet = string, + exclusive = bool, + enable_placement = bool, + regional_capacity = bool, + regional_policy = any, + instance_template = string, + static_node_count = number })) +} + +variable "project" { + type = string +} + +variable "shared_vpc_host_project" { + type = string + default = null +} + +variable "subnetwork_name" { + description = "The name of the pre-defined VPC subnet you want the nodes to attach to based on Region." + default = null + type = string +} + +variable "suspend_time" { + description = "Idle time (in sec) to wait before nodes go away" + default = 300 +} + +variable "zone" { + type = string +} + +output "controller_network_ips" { + value = module.slurm_cluster_controller.instance_network_ips +} + +output "login_network_ips" { + value = module.slurm_cluster_login.instance_network_ips +} diff --git a/slurm-gcp/main.tf b/slurm-gcp/main.tf new file mode 100644 index 0000000..0a0178f --- /dev/null +++ b/slurm-gcp/main.tf @@ -0,0 +1,119 @@ +# +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + region = join("-", slice(split("-", var.zone), 0, 2)) +} + +provider "google" { + project = var.project + region = local.region + credentials = file("credentials.gcp") +} + +module "slurm_cluster_network" { + source = "../../modules/network" + + cluster_name = var.cluster_name + disable_login_public_ips = var.disable_login_public_ips + disable_controller_public_ips = var.disable_controller_public_ips + disable_compute_public_ips = var.disable_compute_public_ips + network_name = var.network_name + partitions = var.partitions + shared_vpc_host_project = var.shared_vpc_host_project + subnetwork_name = var.subnetwork_name + + project = var.project + region = local.region +} + +module "slurm_cluster_controller" { + source = "../../modules/controller" + + boot_disk_size = var.controller_disk_size_gb + boot_disk_type = var.controller_disk_type + image = var.controller_image + instance_template = var.controller_instance_template + cluster_name = var.cluster_name + compute_node_scopes = var.compute_node_scopes + compute_node_service_account = var.compute_node_service_account + disable_compute_public_ips = var.disable_compute_public_ips + disable_controller_public_ips = var.disable_controller_public_ips + labels = var.controller_labels + login_network_storage = var.login_network_storage + login_node_count = var.login_node_count + machine_type = var.controller_machine_type + munge_key = var.munge_key + jwt_key = var.jwt_key + network_storage = var.network_storage + partitions = var.partitions + project = var.project + region = local.region + secondary_disk = var.controller_secondary_disk + secondary_disk_size = var.controller_secondary_disk_size + secondary_disk_type = var.controller_secondary_disk_type + shared_vpc_host_project = var.shared_vpc_host_project + scopes = var.controller_scopes + service_account = var.controller_service_account + subnet_depend = 
module.slurm_cluster_network.subnet_depend + subnetwork_name = var.subnetwork_name + suspend_time = var.suspend_time + zone = var.zone +} + +module "slurm_cluster_login" { + source = "../../modules/login" + + boot_disk_size = var.login_disk_size_gb + boot_disk_type = var.login_disk_type + image = var.login_image + instance_template = var.login_instance_template + cluster_name = var.cluster_name + controller_name = module.slurm_cluster_controller.controller_node_name + controller_secondary_disk = var.controller_secondary_disk + disable_login_public_ips = var.disable_login_public_ips + labels = var.login_labels + login_network_storage = var.login_network_storage + machine_type = var.login_machine_type + node_count = var.login_node_count + region = local.region + scopes = var.login_node_scopes + service_account = var.login_node_service_account + munge_key = var.munge_key + network_storage = var.network_storage + shared_vpc_host_project = var.shared_vpc_host_project + subnet_depend = module.slurm_cluster_network.subnet_depend + subnetwork_name = var.subnetwork_name + zone = var.zone +} + +module "slurm_cluster_compute" { + source = "../../modules/compute" + + cluster_name = var.cluster_name + controller_name = module.slurm_cluster_controller.controller_node_name + disable_compute_public_ips = var.disable_compute_public_ips + network_storage = var.network_storage + partitions = var.partitions + project = var.project + region = local.region + scopes = var.compute_node_scopes + service_account = var.compute_node_service_account + shared_vpc_host_project = var.shared_vpc_host_project + subnet_depend = module.slurm_cluster_network.subnet_depend + subnetwork_name = var.subnetwork_name + zone = var.zone +} + diff --git a/slurm-gcp/versions.tf b/slurm-gcp/versions.tf new file mode 100644 index 0000000..627f6a2 --- /dev/null +++ b/slurm-gcp/versions.tf @@ -0,0 +1,10 @@ +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = "~> 3.0" + } + } + 
+ required_version = ">= 0.12.20" +}