From 61faaaaf2667f10b063fcb50e94f1cc294d2a672 Mon Sep 17 00:00:00 2001
From: melhindi
Date: Sun, 30 Jun 2024 01:21:16 +0200
Subject: [PATCH 1/5] Replace terraform-inventory plugin and add support for
 spot instances

Co-authored-by: toziegler

Replace the outdated terraform-inventory plugin with the Ansible provider
for Terraform (ansible/ansible) and the cloud.terraform Ansible collection.

Add a new variable to deploy instances as AWS spot instances.
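The generated inventory can be sanity-checked before running any playbook; a
minimal check, assuming the collection is installed and `terraform apply` has
already written a state file to the working directory:

```bash
# List the hosts and groups that the cloud.terraform plugin
# derives from the Terraform state.
ansible-inventory --inventory terraform.yaml --graph
```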
---
 driver-pulsar/README.md                    | 25 ++---
 .../deploy/ssd/provision-pulsar-aws.tf     | 99 ++++++++++++++++++-
 driver-pulsar/deploy/ssd/terraform.tfvars  |  1 +
 driver-pulsar/deploy/ssd/terraform.yaml    |  4 +
 4 files changed, 112 insertions(+), 17 deletions(-)
 create mode 100644 driver-pulsar/deploy/ssd/terraform.yaml

diff --git a/driver-pulsar/README.md b/driver-pulsar/README.md
index 730b6f562..df26234aa 100644
--- a/driver-pulsar/README.md
+++ b/driver-pulsar/README.md
@@ -3,13 +3,16 @@
 For instructions on running the OpenMessaging benchmarks for Pulsar, see the [official documentation](http://openmessaging.cloud/docs/benchmarks/pulsar/).
 
 ## Supplement to the official documentation
-
-Before you run `ansible-playbook` with `terraform-inventory`, you must set the environment variable `TF_STATE`. i.e. the completed command should be:
+For Ansible to have access to the inventory defined in the Terraform configuration, the Terraform Collection for Ansible is required.
+Install it with the following command ([source](https://mdawar.dev/blog/ansible-terraform-inventory)):
+```bash
+ansible-galaxy collection install cloud.terraform
+```
 
 ```bash
-TF_STATE=. ansible-playbook \
+ansible-playbook \
   --user ec2-user \
-  --inventory `which terraform-inventory` \
+  --inventory terraform.yaml \
   deploy.yaml
 ```
@@ -20,19 +23,11 @@ The Ansible deployment script supports flexible configuration with a variable fi
 
 ```bash
-TF_STATE=. ansible-playbook \
+ansible-playbook \
   --user ec2-user \
-  --inventory `which terraform-inventory` \
+  --inventory terraform.yaml \
   -e @extra_vars.yaml \
   deploy.yaml
 ```
 
-For example, if you changed the AWS instance type, the two SSD device paths might not be `/dev/nvme1n1` and `/dev/nvme2n1`. In this case, you can configure them like
-
-```yaml
-disk_dev:
-  - /path/to/disk1
-  - /path/to/disk2
-```
-
 See more explanations in [the example variable file](./deploy/ssd/extra_vars.yaml).
 
 ### Enable protocol handlers
@@ -56,9 +51,9 @@ It will download KoP and MoP from the given URLs. Then, the configuration templa
 
 You can change the configuration files and then restart the cluster by executing the following command.
 
 ```bash
-TF_STATE=. ansible-playbook \
+ansible-playbook \
   --user ec2-user \
-  --inventory `which terraform-inventory` \
+  --inventory terraform.yaml \
   -e @extra_vars.yaml \
   restart-brokers.yaml
 ```
diff --git a/driver-pulsar/deploy/ssd/provision-pulsar-aws.tf b/driver-pulsar/deploy/ssd/provision-pulsar-aws.tf
index 34e67700b..3ded96e31 100644
--- a/driver-pulsar/deploy/ssd/provision-pulsar-aws.tf
+++ b/driver-pulsar/deploy/ssd/provision-pulsar-aws.tf
@@ -2,12 +2,16 @@ terraform {
   required_providers {
     aws = {
       source  = "hashicorp/aws"
-      version = "~> 3.0"
+      version = "~> 5.56"
     }
     random = {
       source  = "hashicorp/random"
       version = "3.1"
     }
+    ansible = {
+      source  = "ansible/ansible"
+      version = "1.3.0"
+    }
   }
 }
@@ -33,6 +37,7 @@ variable "key_name" {
 variable "region" {}
 variable "az" {}
 variable "ami" {}
+variable "spot" {}
 
 variable "instance_types" {}
 variable "num_instances" {}
@@ -129,7 +134,16 @@ resource "aws_instance" "zookeeper" {
   vpc_security_group_ids = [
     aws_security_group.benchmark_security_group.id]
   count                  = var.num_instances["zookeeper"]
-
+  dynamic "instance_market_options" {
+    for_each = var.spot ? [1] : []
+    content {
+      market_type = "spot"
+      spot_options {
+        max_price = 0.5
+      }
+    }
+  }
+
   tags = {
     Name = "zk-${count.index}"
   }
@@ -143,6 +157,15 @@ resource "aws_instance" "pulsar" {
   vpc_security_group_ids = [
     aws_security_group.benchmark_security_group.id]
   count                  = var.num_instances["pulsar"]
+  dynamic "instance_market_options" {
+    for_each = var.spot ? [1] : []
+    content {
+      market_type = "spot"
+      spot_options {
+        max_price = 0.7
+      }
+    }
+  }
 
   tags = {
     Name = "pulsar-${count.index}"
@@ -157,6 +180,15 @@ resource "aws_instance" "client" {
   vpc_security_group_ids = [
     aws_security_group.benchmark_security_group.id]
   count                  = var.num_instances["client"]
+  dynamic "instance_market_options" {
+    for_each = var.spot ? [1] : []
+    content {
+      market_type = "spot"
+      spot_options {
+        max_price = 0.9
+      }
+    }
+  }
 
   tags = {
     Name = "pulsar-client-${count.index}"
@@ -171,12 +203,75 @@ resource "aws_instance" "prometheus" {
   vpc_security_group_ids = [
     aws_security_group.benchmark_security_group.id]
   count                  = var.num_instances["prometheus"]
+  dynamic "instance_market_options" {
+    for_each = var.spot ? [1] : []
+    content {
+      market_type = "spot"
+      spot_options {
+        max_price = 0.09
+      }
+    }
+  }
 
   tags = {
     Name = "prometheus-${count.index}"
   }
 }
 
+# Inventory host resources.
+resource "ansible_host" "zookeeper" {
+  name   = "zk-${count.index}"
+  groups = ["zookeeper"] # Groups this host is part of.
+  count  = var.num_instances["zookeeper"]
+
+  variables = {
+    # Connection vars.
+    ansible_user = "ec2-user" # Default user depends on the OS.
+    ansible_host = aws_instance.zookeeper[count.index].public_ip
+
+    # Custom vars that we might use in roles/tasks.
+  }
+}
+resource "ansible_host" "pulsar" {
+  name   = "pulsar-${count.index}"
+  groups = ["pulsar"] # Groups this host is part of.
+  count  = var.num_instances["pulsar"]
+
+  variables = {
+    # Connection vars.
+    ansible_user = "ec2-user" # Default user depends on the OS.
+    ansible_host = aws_instance.pulsar[count.index].public_ip
+
+    # Custom vars that we might use in roles/tasks.
+  }
+}
+resource "ansible_host" "client" {
+  name   = "client-${count.index}"
+  groups = ["client"] # Groups this host is part of.
+  count  = var.num_instances["client"]
+
+  variables = {
+    # Connection vars.
+    ansible_user = "ec2-user" # Default user depends on the OS.
+    ansible_host = aws_instance.client[count.index].public_ip
+
+    # Custom vars that we might use in roles/tasks.
+  }
+}
+resource "ansible_host" "prometheus" {
+  name   = "prometheus-${count.index}"
+  groups = ["prometheus"] # Groups this host is part of.
+  count  = var.num_instances["prometheus"]
+
+  variables = {
+    # Connection vars.
+    ansible_user = "ec2-user" # Default user depends on the OS.
+    ansible_host = aws_instance.prometheus[count.index].public_ip
+
+    # Custom vars that we might use in roles/tasks.
+  }
+}
+
 output "zookeeper" {
   value = {
     for instance in aws_instance.zookeeper :
diff --git a/driver-pulsar/deploy/ssd/terraform.tfvars b/driver-pulsar/deploy/ssd/terraform.tfvars
index 7164caa2d..d506d92e7 100644
--- a/driver-pulsar/deploy/ssd/terraform.tfvars
+++ b/driver-pulsar/deploy/ssd/terraform.tfvars
@@ -2,6 +2,7 @@ public_key_path = "~/.ssh/pulsar_aws.pub"
 region = "us-west-2"
 az     = "us-west-2a"
 ami    = "ami-08970fb2e5767e3b8" // RHEL-8
+spot   = true
 
 instance_types = {
   "pulsar"     = "i3en.6xlarge"
diff --git a/driver-pulsar/deploy/ssd/terraform.yaml b/driver-pulsar/deploy/ssd/terraform.yaml
new file mode 100644
index 000000000..79aeb15ef
--- /dev/null
+++ b/driver-pulsar/deploy/ssd/terraform.yaml
@@ -0,0 +1,4 @@
+plugin: cloud.terraform.terraform_provider
+project_path: .
+# Terraform binary (available in the $PATH) or full path to the binary.
+binary_path: terraform

From fa8be97f01ffad25db654344cfc6ec4dc59a9200 Mon Sep 17 00:00:00 2001
From: melhindi
Date: Wed, 3 Jul 2024 07:58:44 +0200
Subject: [PATCH 2/5] Replace hard-coded storage devices with Ansible discovery

Co-authored-by: toziegler

When deploying AWS instances, the labels of the NVMe devices can change with
every reboot (e.g., `/dev/nvme1n1` becomes `/dev/nvme0n1`). Hence, hard-coding
the disk labels leads to failed deployments when, e.g., `/dev/nvme1n1` is used
as the boot disk. By discovering storage devices with Ansible, we can
dynamically select the available storage devices for Pulsar/BookKeeper.
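The discovery keys off the `ansible_devices` facts gathered by the setup
module. One way to inspect what a node actually reports (a sketch, assuming
the cloud.terraform inventory from the previous commit and a provisioned
`pulsar-0` host):

```bash
# Print the block-device facts for one broker; NVMe disks show up as
# keys like "nvme1n1", each with a "host" field naming the controller
# and a "partitions" map that is empty for unpartitioned disks.
ansible -i terraform.yaml pulsar-0 -m ansible.builtin.setup -a 'filter=ansible_devices'
```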
---
 driver-pulsar/deploy/ssd/deploy.yaml | 47 ++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/driver-pulsar/deploy/ssd/deploy.yaml b/driver-pulsar/deploy/ssd/deploy.yaml
index 48a816dbb..191901ae9 100644
--- a/driver-pulsar/deploy/ssd/deploy.yaml
+++ b/driver-pulsar/deploy/ssd/deploy.yaml
@@ -26,10 +26,9 @@
       pulsar_version: "{{ pulsar_version | default('2.11.0') }}"
       node_exporter_version: "{{ node_exporter_version | default('1.2.2') }}"
       prometheus_version: "{{ prometheus_version | default('2.31.1') }}"
-      disk_dev: "{{ disk_dev | default(['/dev/nvme1n1', '/dev/nvme2n1']) }}"
   - set_fact:
       pulsar_binary:
-        src: "https://downloads.apache.org/pulsar/pulsar-{{ pulsar_version }}/apache-pulsar-{{ pulsar_version }}-bin.tar.gz"
+        src: "https://archive.apache.org/dist/pulsar/pulsar-{{ pulsar_version }}/apache-pulsar-{{ pulsar_version }}-bin.tar.gz"
         remote: yes
     when: pulsar_binary is not defined
   - set_fact:
@@ -48,13 +47,27 @@
   connection: ssh
   become: true
   tasks:
+    - name: Initialize empty list for devices
+      set_fact:
+        storage_devices: []
+      no_log: true
+
+    - name: Get NVMe devices
+      set_fact:
+        storage_devices: "{{ storage_devices + ['/dev/' ~ item.key ] }}"
+      with_dict: "{{ ansible_devices }}"
+      when: "item.value.host.startswith('Non-Volatile memory controller:') and not item.value.partitions"
+
+    - name: Show device names
+      debug: var=storage_devices
+
     - name: Format disks
       filesystem:
         fstype: xfs
         dev: '{{ item }}'
       with_items:
-        - "{{ disk_dev[0] }}"
-        - "{{ disk_dev[1] }}"
+        - "{{ storage_devices[0] }}"
+        - "{{ storage_devices[1] }}"
     - name: Mount disks
       mount:
         path: "{{ item.path }}"
         src: "{{ item.src }}"
         fstype: xfs
         opts: defaults,noatime,nodiscard
         state: mounted
       with_items:
-        - { path: "/mnt/zookeeper/logs", src: "{{ disk_dev[0] }}" }
-        - { path: "/mnt/zookeeper/data", src: "{{ disk_dev[1] }}" }
+        - { path: "/mnt/zookeeper/logs", src: "{{ storage_devices[0] }}" }
+        - { path: "/mnt/zookeeper/data", src: "{{ storage_devices[1] }}" }
 
 - name: Format and mount disks for Pulsar/BookKeeper hosts
   hosts: pulsar
   connection: ssh
   become: true
   tasks:
+    - name: Initialize empty list for devices
+      set_fact:
+        storage_devices: []
+      no_log: true
+
+    - name: Get NVMe devices
+      set_fact:
+        storage_devices: "{{ storage_devices + ['/dev/' ~ item.key ] }}"
+      with_dict: "{{ ansible_devices }}"
+      when: "item.value.host.startswith('Non-Volatile memory controller:') and not item.value.partitions"
+
+    - name: Show device names
+      debug: var=storage_devices
+
     - name: Format disks
       filesystem:
         fstype: xfs
         dev: '{{ item }}'
       with_items:
-        - "{{ disk_dev[0] }}"
-        - "{{ disk_dev[1] }}"
+        - "{{ storage_devices[0] }}"
+        - "{{ storage_devices[1] }}"
     - name: Mount disks
       mount:
         path: "{{ item.path }}"
@@ -86,8 +113,8 @@
         opts: defaults,noatime,nodiscard
         state: mounted
       with_items:
-        - { path: "/mnt/journal", src: "{{ disk_dev[0] }}" }
-        - { path: "/mnt/storage", src: "{{ disk_dev[1] }}" }
+        - { path: "/mnt/journal", src: "{{ storage_devices[0] }}" }
+        - { path: "/mnt/storage", src: "{{ storage_devices[1] }}" }
 
 - name: Install Node exporter on Brokers to collect system metrics
   hosts: pulsar

From 90b15dce5bc0be4a0e782058b50afc9507d298a7 Mon Sep 17 00:00:00 2001
From: melhindi
Date: Wed, 3 Jul 2024 08:03:54 +0200
Subject: [PATCH 3/5] Prevent uploading benchmark code multiple times

Co-authored-by: toziegler

When network connectivity is slow, uploading the benchmark code to the nodes
via Ansible takes a long time, and it is not required if the step has already
been performed. By splitting the copy and rename steps and using `creates`,
we can prevent multiple (slow) uploads.
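For reference, this is the resulting task pair (a sketch of the change in the
diff below); once `/opt/benchmark` exists, both steps are skipped on re-runs:

```yaml
# Skipped when /opt/benchmark already exists, so the tarball is uploaded once.
- name: Copy benchmark code
  unarchive:
    src: ../../../package/target/openmessaging-benchmark-0.0.1-SNAPSHOT-bin.tar.gz
    dest: /opt
    creates: /opt/benchmark

# Also guarded by `creates`, so the rename never runs against a missing directory.
- name: Rename benchmark directory
  ansible.builtin.shell:
    cmd: mv /opt/openmessaging-benchmark-0.0.1-SNAPSHOT /opt/benchmark
    creates: /opt/benchmark
```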
---
 driver-pulsar/deploy/ssd/deploy.yaml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/driver-pulsar/deploy/ssd/deploy.yaml b/driver-pulsar/deploy/ssd/deploy.yaml
index 191901ae9..53504572c 100644
--- a/driver-pulsar/deploy/ssd/deploy.yaml
+++ b/driver-pulsar/deploy/ssd/deploy.yaml
@@ -383,12 +383,16 @@
       template:
         src: "templates/client.conf"
         dest: "/opt/pulsar/conf/client.conf"
-    - file: path=/opt/benchmark state=absent
     - name: Copy benchmark code
       unarchive:
         src: ../../../package/target/openmessaging-benchmark-0.0.1-SNAPSHOT-bin.tar.gz
         dest: /opt
-    - shell: mv /opt/openmessaging-benchmark-0.0.1-SNAPSHOT /opt/benchmark
+        creates: /opt/benchmark
+
+    - name: Rename benchmark directory
+      ansible.builtin.shell:
+        cmd: mv /opt/openmessaging-benchmark-0.0.1-SNAPSHOT /opt/benchmark
+        creates: /opt/benchmark
 
     - template:
         src: "templates/workers.yaml"

From 048eb2202b08eb8a8815def57144cc540fb94170 Mon Sep 17 00:00:00 2001
From: melhindi
Date: Wed, 3 Jul 2024 22:25:53 +0200
Subject: [PATCH 4/5] Make on-demand instances the default

Co-authored-by: toziegler

For long-running benchmarks, on-demand instances are a more meaningful
default than spot instances.
---
 driver-pulsar/deploy/ssd/terraform.tfvars | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver-pulsar/deploy/ssd/terraform.tfvars b/driver-pulsar/deploy/ssd/terraform.tfvars
index d506d92e7..1307bbbbb 100644
--- a/driver-pulsar/deploy/ssd/terraform.tfvars
+++ b/driver-pulsar/deploy/ssd/terraform.tfvars
@@ -2,7 +2,7 @@ public_key_path = "~/.ssh/pulsar_aws.pub"
 region = "us-west-2"
 az     = "us-west-2a"
 ami    = "ami-08970fb2e5767e3b8" // RHEL-8
-spot   = true
+spot   = false
 
 instance_types = {
   "pulsar"     = "i3en.6xlarge"

From 185da0cceb1af782cb5f9a9b1e3c2d1dd3ef1b27 Mon Sep 17 00:00:00 2001
From: melhindi
Date: Mon, 9 Sep 2024 16:28:05 +0200
Subject: [PATCH 5/5] Support Ubuntu and CentOS images
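The playbooks now branch on Ansible's distribution facts instead of assuming
RHEL. A quick way to see what a node reports (a sketch, assuming the
cloud.terraform inventory is in place):

```bash
# Show the fact the new `when:` conditionals test against
# (e.g. "Ubuntu", "CentOS", "RedHat", "Debian").
ansible -i terraform.yaml all -m ansible.builtin.setup -a 'filter=ansible_distribution'
```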
---
 driver-pulsar/README.md                   |  1 +
 driver-pulsar/deploy/ssd/deploy.yaml      | 60 ++++++++++---------
 .../deploy/ssd/provision-pulsar-aws.tf    | 20 ++++---
 driver-pulsar/deploy/ssd/terraform.tfvars |  7 ++-
 4 files changed, 49 insertions(+), 39 deletions(-)

diff --git a/driver-pulsar/README.md b/driver-pulsar/README.md
index df26234aa..017aab7d0 100644
--- a/driver-pulsar/README.md
+++ b/driver-pulsar/README.md
@@ -7,6 +7,7 @@ For Ansible to have access to the inventory defined in the Terraform configurati
 Install it with the following command ([source](https://mdawar.dev/blog/ansible-terraform-inventory)):
 ```bash
 ansible-galaxy collection install cloud.terraform
+ansible-galaxy role install geerlingguy.docker
 ```
diff --git a/driver-pulsar/deploy/ssd/deploy.yaml b/driver-pulsar/deploy/ssd/deploy.yaml
index 53504572c..13956bf19 100644
--- a/driver-pulsar/deploy/ssd/deploy.yaml
+++ b/driver-pulsar/deploy/ssd/deploy.yaml
@@ -56,7 +56,7 @@
       set_fact:
         storage_devices: "{{ storage_devices + ['/dev/' ~ item.key ] }}"
       with_dict: "{{ ansible_devices }}"
-      when: "item.value.host.startswith('Non-Volatile memory controller:') and not item.value.partitions"
+      when: "item.key.startswith('nvme') and not item.value.partitions"
 
     - name: Show device names
       debug: var=storage_devices
@@ -93,7 +93,7 @@
       set_fact:
         storage_devices: "{{ storage_devices + ['/dev/' ~ item.key ] }}"
       with_dict: "{{ ansible_devices }}"
-      when: "item.value.host.startswith('Non-Volatile memory controller:') and not item.value.partitions"
+      when: "item.key.startswith('nvme') and not item.value.partitions"
 
     - name: Show device names
       debug: var=storage_devices
@@ -179,19 +179,39 @@
   connection: ssh
   become: true
   tasks:
+    - name: Install Tuned packages
+      ansible.builtin.package:
+        state: latest
+        update_cache: true
+        name:
+          - tuned
+
     - name: Enable Tuned
       service: name=tuned state=started enabled=yes
     - name: Set performance profile
       command: tuned-adm profile latency-performance
-    - name: Install RPM packages
-      yum:
+    - name: Install packages
+      ansible.builtin.package:
         state: latest
-        pkg:
+        name:
           - wget
-          - java-17-openjdk
-          - java-17-openjdk-devel
           - sysstat
           - vim
           - chrony
-      when: ansible_facts['distribution'] == 'RedHat'
+    - name: Install Java on RedHat/CentOS
+      ansible.builtin.package:
+        state: latest
+        name:
+          - java-17-openjdk
+          - java-17-openjdk-devel
+      when:
+        - ansible_facts['distribution'] == 'RedHat' or ansible_facts['distribution'] == 'CentOS'
+    - name: Install Java on Debian/Ubuntu
+      ansible.builtin.package:
+        state: latest
+        name:
+          - openjdk-17-jdk
+      when:
+        - ansible_facts['distribution'] == 'Debian' or ansible_facts['distribution'] == 'Ubuntu'
     - file: path=/opt/pulsar state=absent
     - file: path=/opt/pulsar state=directory
     - name: Download Pulsar binary package
@@ -467,27 +487,9 @@
   connection: ssh
   become: true
   tasks:
-    - name: Add Extras Repo
-      shell: yum-config-manager --enable rhui-REGION-rhel-server-extras
-      when:
-        - ansible_facts['distribution'] == 'RedHat'
-        - ansible_facts['distribution_major_version'] | int <= 7
-    - name: Docker repo
-      yum_repository:
-        name: docker
-        description: repo for docker
-        baseurl: "https://download.docker.com/linux/centos/{{ ansible_facts['distribution_major_version'] }}/x86_64/stable/"
-        gpgcheck: no
-      when: ansible_facts['distribution'] == 'RedHat'
-    - name: Installing docker
-      yum:
-        state: latest
-        pkg: ['docker-ce']
-    - name: Start docker
-      service:
-        name: docker
-        state: started
-        enabled: yes
+    - name: Install docker
+      include_role:
+        name: geerlingguy.docker
     - file: path=/opt/prometheus state=absent
     - file: path=/opt/prometheus state=directory
     - name: Download Prometheus Binary Package
diff --git a/driver-pulsar/deploy/ssd/provision-pulsar-aws.tf b/driver-pulsar/deploy/ssd/provision-pulsar-aws.tf
index 3ded96e31..946312202 100644
--- a/driver-pulsar/deploy/ssd/provision-pulsar-aws.tf
+++ b/driver-pulsar/deploy/ssd/provision-pulsar-aws.tf
@@ -37,12 +37,18 @@ variable "key_name" {
 variable "region" {}
 variable "az" {}
 variable "ami" {}
+variable "user" {}
 variable "spot" {}
 
 variable "instance_types" {}
 variable "num_instances" {}
 
 provider "aws" {
   region = var.region
+  default_tags {
+    tags = {
+      Project = "pulsar-benchmark"
+    }
+  }
 }
@@ -139,7 +145,7 @@
     content {
       market_type = "spot"
      spot_options {
-        max_price = 0.5
+        max_price = 3.0
      }
     }
   }
@@ -162,7 +168,7 @@
     content {
       market_type = "spot"
       spot_options {
-        max_price = 0.7
+        max_price = 3.0
       }
     }
   }
@@ -185,7 +191,7 @@
     content {
       market_type = "spot"
       spot_options {
-        max_price = 0.9
+        max_price = 3.0
       }
     }
   }
@@ -226,7 +232,7 @@
 
   variables = {
     # Connection vars.
-    ansible_user = "ec2-user" # Default user depends on the OS.
+    ansible_user = var.user # Default user depends on the OS.
     ansible_host = aws_instance.zookeeper[count.index].public_ip
 
     # Custom vars that we might use in roles/tasks.
@@ -239,7 +245,7 @@
 
   variables = {
     # Connection vars.
-    ansible_user = "ec2-user" # Default user depends on the OS.
+    ansible_user = var.user # Default user depends on the OS.
ansible_host = aws_instance.pulsar[count.index].public_ip # Custom vars that we might use in roles/tasks. @@ -252,7 +258,7 @@ resource "ansible_host" "client" { variables = { # Connection vars. - ansible_user = "ec2-user" # Default user depends on the OS. + ansible_user = var.user # Default user depends on the OS. ansible_host = aws_instance.client[count.index].public_ip # Custom vars that we might use in roles/tasks. @@ -265,7 +271,7 @@ resource "ansible_host" "prometheus" { variables = { # Connection vars. - ansible_user = "ec2-user" # Default user depends on the OS. + ansible_user = var.user # Default user depends on the OS. ansible_host = aws_instance.prometheus[count.index].public_ip # Custom vars that we might use in roles/tasks. diff --git a/driver-pulsar/deploy/ssd/terraform.tfvars b/driver-pulsar/deploy/ssd/terraform.tfvars index 1307bbbbb..2c4650f39 100644 --- a/driver-pulsar/deploy/ssd/terraform.tfvars +++ b/driver-pulsar/deploy/ssd/terraform.tfvars @@ -1,7 +1,8 @@ public_key_path = "~/.ssh/pulsar_aws.pub" -region = "us-west-2" -az = "us-west-2a" -ami = "ami-08970fb2e5767e3b8" // RHEL-8 +region = "us-east-2" +az = "us-east-2a" +ami = "ami-012e6364f6bd17628" +user = "ubuntu" spot = false instance_types = {