# Ansible Practice
Install the network-manager package and add the user `devops_user`.
``` yaml
---
# Play 1: install pinned apache2 + network-manager, then report package facts.
- name: Display installed packages
  hosts: web
  tasks:
    - name: Install the packages
      apt:
        name:
          - apache2=2.4.41-4ubuntu3.12  # pinned version
          - network-manager
    - name: Gather info on installed packages
      package_facts:
        manager: auto
    - name: List installed packages
      debug:
        var: ansible_facts.packages
    - name: Display NetworkManager version
      debug:
        msg: "Version {{ ansible_facts.packages['network-manager'][0].version }}"
      # Guard: the key is absent if the package failed to install.
      when: "'network-manager' in ansible_facts.packages"

# Play 2: ensure groups exist, then create the user with a generated SSH key.
- name: New user devops
  hosts: web
  tasks:
    # Original task name said "auditors" but the loop creates these two groups.
    - name: Ensure required groups exist
      group:
        name: "{{ item }}"
        state: present
      loop:
        - sys_admins
        - developers
    - name: Add new user to the development machine and assign the appropriate groups
      user:
        name: devops_user
        shell: /bin/bash
        groups: sys_admins,developers
        append: true
        generate_ssh_key: true
        ssh_key_bits: 2048
        ssh_key_file: .ssh/id_my_rsa  # relative to the user's home directory
```
## customized tool installation

group_vars/all.yml:

``` yaml
env: production
installation_folder_dir: /tmp/sumtool
sumtool_folder: "{{ installation_folder_dir }}/sum_2.9.0_Linux_x86_64"
sumtool_url: https://www.supermicro.com/Bios/sw_download/466/sum_2.9.0_Linux_x86_64_20220804.tar.gz
cluster_bmc_credential: "/foo/{{ env }}/config/cluster-bmc-credential.txt"
```

playbook.yml:
# Tasks: download/unpack the Supermicro SUM tool and dump the current BIOS config.
- name: Create temp-installation folder
  file:
    path: "{{ installation_folder_dir }}"
    state: directory

- name: Check sumtool directory exists
  stat:
    path: "{{ sumtool_folder }}"
  register: sum_command

- name: Download the Supermicro SUM tool
  get_url:
    url: "{{ sumtool_url }}"
    dest: "{{ installation_folder_dir }}"
  # Idiomatic boolean test instead of "== False".
  when: not sum_command.stat.exists

- name: Uncompress the tar file
  unarchive:
    src: "{{ installation_folder_dir }}/{{ sumtool_url | basename }}"
    dest: "{{ installation_folder_dir }}"
    remote_src: true
    list_files: true  # exposes .files on the registered result below
  register: unzip_contents

- name: Set sumtool dir and CPU series facts
  set_fact:
    sumtool_dir: "{{ installation_folder_dir }}/{{ unzip_contents.files[0] }}"
    # EPYC 7xx2 (Rome) vs everything else (assumed 7003 / Milan).
    # "is search(...)" returns a real boolean; the original
    # "regex_search(...) is not none" misbehaves when templating turns the
    # no-match None into an empty string.
    cpu_series: "{{ '7002' if ansible_processor[2] is search('7[0-9][0-9]2') else '7003' }}"

- name: Show cpu_series
  debug:
    msg:
      - "{{ cpu_series }}"
      - "{{ ansible_processor[2] }}"

- name: Check python-lxml is installed
  shell: pip3 list | grep lxml
  register: pip3_list
  ignore_errors: true  # rc is evaluated by the install task below

# NOTE: To use ansible xml, it's necessary to install python3 and python3-lxml.
- name: Install python-lxml
  pip:
    name: lxml
  when: pip3_list.rc != 0

- name: Get ipmi ip
  shell: |
    cat "{{ cluster_bmc_credential }}" | grep "{{ target_host }}" | awk '{split($4,a,"/");print a[2]}'
  register: ipmi_ip

- name: Get ipmi password
  shell: |
    cat "{{ cluster_bmc_credential }}" | grep "{{ target_host }}" | awk '{print $6}'
  register: ipmi_password

- name: Get the current BIOS configurations
  shell:
    chdir: "{{ sumtool_dir }}"
    # original_bios_config_file is expected from group/host vars — TODO confirm.
    cmd: "./sum -i {{ ipmi_ip.stdout }} -u ADMIN -p {{ ipmi_password.stdout }} -c GetCurrentBiosCfg --overwrite --file {{ sumtool_dir }}/{{ original_bios_config_file }}"
  ignore_errors: true
  become: true
## install ansible
- name: Install the package groups and python39
  ansible.builtin.yum:
    # Pass the whole list at once: one yum transaction instead of one per item
    # (looping a package module is an Ansible anti-pattern).
    name:
      - "@Infiniband Support"
      - "@Performance Tools"
      - "@Development Tools"
      - "@Scientific Support"
      - "@System Tools"
      - python39
    state: present
  become: true
  when: ansible_distribution == "RedHat"

- name: Install ipmitool
  include_role:
    name: ipmitool_package

- name: Install ansible
  ansible.builtin.pip:
    name: ansible-core==2.14
    executable: pip3.9  # pip from the python39 package installed above
## check sw status and use template
---
- name: Validate Cluster Preparation
  hosts: cluster_sn,cluster_ln,cluster_an
  vars:
    default_owner: kapl
    default_group: kapl
  tasks:
    # Derive the node type (sn/ln/an) from the host's cluster_* group name.
    - name: Node Type
      set_fact:
        # NOTE(review): group_names is sorted alphabetically; confirm that
        # index 1 is always the cluster_* group for every host in this play.
        node_type: "{{ group_names[1].split('_')[1] }}"
    - name: Debug Node Type
      debug:
        msg: "node type: {{ node_type }}"
- name: Passwordless - root - authorized_keys
stat:
path: /root/.ssh/authorized_keys
register: passwordless_root_authorized_keys
- name: Passwordless - root - cn001 public key
shell: grep 'root@k22cn001' /root/.ssh/authorized_keys
register: passwordless_root_control_node_1
- name: Passwordless - root - cn002 public key
shell: grep 'root@k22cn002' /root/.ssh/authorized_keys
register: passwordless_root_control_node_2
- name: Passwordless - kapl - authorized_keys
stat:
path: /home/kapl/.ssh/authorized_keys
register: passwordless_kapl_authorized_keys
- name: Passwordless - kapl - cn001 public key
shell: grep 'kapl@k22cn001' /home/kapl/.ssh/authorized_keys
register: passwordless_kapl_control_node_1
- name: Passwordless - kapl - cn002 public key
shell: grep 'kapl@k22cn002' /home/kapl/.ssh/authorized_keys
register: passwordless_kapl_control_node_2
- name: Passwordless - status
set_fact:
passwordless_status:
control_node_1:
root: "{{ passwordless_root_authorized_keys.stat.exists and passwordless_root_control_node_1.rc == 0 }}"
kapl: "{{ passwordless_kapl_authorized_keys.stat.exists and passwordless_kapl_control_node_1.rc == 0 }}"
control_node_2:
root: "{{ passwordless_root_authorized_keys.stat.exists and passwordless_root_control_node_2.rc == 0 }}"
kapl: "{{ passwordless_kapl_authorized_keys.stat.exists and passwordless_kapl_control_node_2.rc == 0 }}"
- name: Debug Passwordless - kapl
debug:
var: passwordless_status
- name: Get CPU SMT
shell: cat /sys/devices/system/cpu/smt/active
register: cpu_smt
- name: CPU - SMT status
set_fact:
cpu_smt_status: "{% if (cpu_smt.stdout == '1' ) %}true{% else %}false{% endif %}"
- name: Fetch interface names
debug:
msg: "{{ item }}"
loop: "{{ ansible_facts.interfaces | select('match', '^enp[0-9]+s0f0$')}}"
- name: Fetch interface and set
set_fact:
network_10g_interface_name: "{{ item }}"
loop: "{{ ansible_facts.interfaces | select('match', '^enp[0-9]+s0f0$')}}"
- name: Set fact for network - 10g
set_fact:
network_10g:
name: "{{ network_10g_interface_name }}"
mtu: "{{ ansible_facts[network_10g_interface_name].mtu }}"
speed: "{{ ansible_facts[network_10g_interface_name].speed }}"
driver: {}
firmware: {}
network_100g:
name: ib0
mtu: "{{ ansible_facts['ib0'].mtu }}"
speed: "{{ ansible_facts['ib0'].speed }}"
driver: {}
firmware: {}
- name: Get network driver - 10g
shell: ethtool -i {{ network_10g.name }} | grep version | head -n 1 | awk '{print $2}'
register: network_10g_driver
- name: Get network firmware - 10g
shell: ethtool -i {{ network_10g.name }} | grep firmware-version | awk '{print $3}'
register: network_10g_firmware
- name: Get network driver - 100g
shell: ethtool -i {{ network_100g.name }} | grep version | head -n 1 | awk '{print $2}'
register: network_100g_driver
- name: Get network firmware - 100g
shell: ethtool -i {{ network_100g.name }} | grep firmware-version | awk '{print $2}'
register: network_100g_firmware
- name: Update network driver, firmware
ansible.utils.update_fact:
updates:
- path: network_10g.driver.version
value: "{{ network_10g_driver.stdout }}"
- path: network_10g.firmware.version
value: "{{ network_10g_firmware.stdout }}"
- path: network_100g.driver.version
value: "{{ network_100g_driver.stdout }}"
- path: network_100g.firmware.version
value: "{{ network_100g_firmware.stdout }}"
register: updated
- name: Update fact for network
set_fact:
network_10g: "{{ updated.network_10g }}"
network_100g: "{{ updated.network_100g }}"
- name: Debug network variables
debug:
msg:
- "{{ network_10g }}"
- "{{ network_100g }}"
- name: Get subscription info
shell: subscription-manager status
register: subscription_info
become: yes
- name: set_status either to True or False
set_fact:
subscription_status: "{% if (subscription_info.stdout_lines | regex_search('Overall Status: Current')) %}true{% else %}false{% endif %}"
- name: Benchmark - HPL code drop
stat:
path: /tmp/Ansible_HPL/install_packages/amd-zen-hpl-avx2-2023_01.tar
register: hpl_code_drop
- name: Benchmark - HPL prebuilt env
stat:
path: /AMD-HPL
register: hpl_prebuilt_env
- name: Benchmark - HPL status
set_fact:
benchmark_hpl_status: "{% if (hpl_code_drop.stat.exists and hpl_prebuilt_env.stat.exists ) %}true{% else %}false{% endif %}"
- name: Benchmark - MPI spack
shell: source /tmp/spack/share/spack/setup-env.sh; spack --version
register: mpi_spack
ignore_errors: yes
- name: Benchmark - MPI status
set_fact:
benchmark_mpi_status: "{% if (mpi_spack.rc == 0 ) %}true{% else %}false{% endif %}"
benchmark_mpi_spack_version: "{{ mpi_spack.stdout }}"
- name: Benchmark - SPECRate install directory
stat:
path: /tmp/Ansible_SPEC-CPU-2017/cpu2017
register: spec_rate_install_directory
- name: Benchmark - SPECRate config
stat:
path: /tmp/Ansible_SPEC-CPU-2017/install_packages/config/spec-cpu_fprate_amd-7742_2_aocc_320_linux_x86.tar.xz
register: spec_rate_config
- name: Benchmark - SPECRate status
set_fact:
benchmark_spec_rate_status: "{% if (spec_rate_install_directory.stat.exists and spec_rate_config.stat.exists ) %}true{% else %}false{% endif %}"
- name: Benchmark - SPECAccel install directory
stat:
path: /tmp/Ansible_SPEC-ACCEL/accel
register: spec_accel_install_directory
- name: Benchmark - SPECAccel config
stat:
path: /tmp/Ansible_SPEC-ACCEL/install_packages/config/spec-accel_openacc_amd-7742_2_aocc_320_linux_x86.cfg
register: spec_accel_config
- name: Benchmark - SPECAccel flag
stat:
path: /tmp/Ansible_SPEC-ACCEL/install_packages/config/nvidia_flags.xml
register: spec_accel_flag
- name: Benchmark - SPECAccel status
set_fact:
benchmark_spec_accel_status: "{% if (spec_accel_install_directory.stat.exists and spec_accel_config.stat.exists and spec_accel_flag.stat.exists ) %}true{% else %}false{% endif %}"
- name: Benchmark - iperf3
shell: rpm -qa | grep iperf3
register: iperf3_installed
ignore_errors: yes
when: ansible_os_family == "RedHat"
- name: Benchmark - iperf3 status
set_fact:
benchmark_iperf3_status: "{% if ( iperf3_installed.rc == 0 ) %}true{% else %}false{% endif %}"
- name: Mount - /opt_shared
shell: mount | grep /opt_shared | awk '{print $1, $2, $3, $4, $5}'
register: mount_opt_shared_result
- name: Mount - /opt_shared info
set_fact:
nfs_client:
mount_opt_shared_status: "{{ mount_opt_shared_result.rc == 0 and 'nfs' in mount_opt_shared_result.stdout }}"
mount_opt_shared_info: "{{ mount_opt_shared_result.stdout }}"
- name: lsblk info
shell: lsblk
register: lsblk_result
- name: df -hT info
shell: df -hT
register: df_result
- name: Fetch disk name and set
set_fact:
disk_names: "{{ ansible_facts.devices.keys() | select('match', '^nvme[0-9]{1}n[0-9]{1}') | map('regex_replace', '^(.*)$', '/dev/\\1') | list }}"
- name: Debug disk names
debug:
var: disk_names
- name: smartctl info
shell: "smartctl --all {{ item }}"
loop: "{{ disk_names }}"
register: smartctl_result
- name: Debug directory for data file
debug:
msg:
- "{{ cluster_node_bom_host }}/{{ inventory_hostname }}"
- name: Create directory for data file
file:
path: "{{ item }}"
state: directory
owner: "{{ default_owner }}"
group: "{{ default_group }}"
mode: '0777'
loop:
- "{{ cluster_node_bom_host }}/{{ inventory_hostname }}"
- name: Generate host bom txt
template:
src: ../templates/host_bom.txt.j2
dest: "{{ cluster_node_bom_host }}/{{ inventory_hostname }}/{{ node_type }}-bom.txt"
owner: "{{ default_owner }}"
group: "{{ default_group }}"
- name: Generate host bom json
template:
src: ../templates/host_bom.json.j2
dest: "{{ cluster_node_bom_host }}/{{ inventory_hostname }}/{{ node_type }}-bom.json"
owner: "{{ default_owner }}"
group: "{{ default_group }}"
- name: Generate lsblk file
template:
src: ../templates/host_lsblk.txt.j2
dest: "{{ cluster_node_bom_host }}/{{ inventory_hostname }}/lsblk.txt"
owner: "{{ default_owner }}"
group: "{{ default_group }}"
- name: Generate df file
template:
src: ../templates/host_df.txt.j2
dest: "{{ cluster_node_bom_host }}/{{ inventory_hostname }}/df.txt"
owner: "{{ default_owner }}"
group: "{{ default_group }}"
- name: Generate smartctl file
template:
src: ../templates/host_smartctl.txt.j2
dest: "{{ cluster_node_bom_host }}/{{ inventory_hostname }}/smartctl.txt"
owner: "{{ default_owner }}"
group: "{{ default_group }}"
host_bom.json.j2
{#- BUG FIX: the original emitted invalid JSON — the comma after the "cpu"
    object was missing and a stray "}," followed the "network" object. -#}
{
  "os_version": "{{ ansible_distribution_version }}",
  "kernel_version": "{{ ansible_kernel }}",
  "cpu": {
    "smt": {{ cpu_smt_status | to_json }}
  },
  "network": {
    "10g": {{ network_10g | to_json }},
    "100g": {{ network_100g | to_json }}
  },
  "license": {{ subscription_status | to_json }},
  "benchmark": {
    "hpl": {"installed": {{ benchmark_hpl_status | to_json }} },
    "mpi": {"installed": {{ benchmark_mpi_status | to_json }} },
    "spec_rate": {"installed": {{ benchmark_spec_rate_status | to_json }} },
    "spec_accel": {"installed": {{ benchmark_spec_accel_status | to_json }} },
    "iperf3": {"installed": {{ benchmark_iperf3_status | to_json }} }
  },
  "passwordless": {{ passwordless_status | to_json }},
  "nfs_client": {{ nfs_client | to_json }}
}
host_bom.txt.j2
{#- Plain-text BOM report: fixed-width Name / Value / Notes columns built from
    facts and status vars set by the "Validate Cluster Preparation" play. -#}
##! Title: Get Node Bom {{ node_type | upper }} ({{ inventory_hostname }})
##! Priority: 1
##! TimeStamp: {{ "%m%d:%H%M" | strftime(ansible_date_time.epoch) }}
{{ "%-35s%-30s%-30s" | format("Name", "Value", "Notes") }}
---------------------------------------------------------------------
{{ "%-35s%-30s%-30s" | format("OS-Version", ansible_distribution_version, "-") }}
{{ "%-35s%-30s%-30s" | format("Kernel-Version", ansible_kernel, "-") }}
{{ "%-35s%-30s%-30s" | format("CPU-SMT", cpu_smt_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-10G-Name", network_10g.name, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-10G-Driver-Version", network_10g.driver.version, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-10G-FW-Version", network_10g.firmware.version, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-10G-MTU", network_10g.mtu, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-10G-Speed", network_10g.speed, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-100G-Name", network_100g.name, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-100G-Driver-Version", network_100g.driver.version, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-100G-FW-Version", network_100g.firmware.version, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-100G-MTU", network_100g.mtu, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-100G-Speed", network_100g.speed, "-") }}
{{ "%-35s%-30s%-30s" | format("License-Registered", subscription_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Benchmarks-HPL-Installed", benchmark_hpl_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Benchmarks-MPI-Installed", benchmark_mpi_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Benchmarks-SPECRate-Installed", benchmark_spec_rate_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Benchmarks-SPECAccel-Installed", benchmark_spec_accel_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Benchmarks-Iperf-Installed", benchmark_iperf3_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Passwordless-cn001-root", passwordless_status.control_node_1.root, "-") }}
{{ "%-35s%-30s%-30s" | format("Passwordless-cn001-kapl", passwordless_status.control_node_1.kapl, "-") }}
{{ "%-35s%-30s%-30s" | format("Passwordless-cn002-root", passwordless_status.control_node_2.root, "-") }}
{{ "%-35s%-30s%-30s" | format("Passwordless-cn002-kapl", passwordless_status.control_node_2.kapl, "-") }}
{{ "%-35s%-30s%-30s" | format("NFS-Client-Mount", nfs_client.mount_opt_shared_status, nfs_client.mount_opt_shared_info) }}
host_smartctl.txt.j2
{#- Generalized: iterate over every scanned disk instead of hard-coding
    results[0] and results[1] (which broke for any disk count other than 2).
    Output is identical to the original for two disks. -#}
{% for result in smartctl_result.results %}
------------------------------------------------------------------
{{ result.item }}
------------------------------------------------------------------
{{ result.stdout }}
{% endfor %}
## cron
``` yaml
# Play 1: control nodes — cron jobs run only on the primary; they are created
# everywhere but disabled on non-primary nodes so failover just flips a flag.
- name: Control Node Crontab setting
  hosts: cluster_cn
  vars:
    root_dir: "{{ project_root }}/{{ project_env }}"
    primary_control_node: k22cn001
    stop_cron_job: false
  become: true
  tasks:
    - name: Disable cron jobs on non-primary control nodes
      set_fact:
        stop_cron_job: true
      # Only disable when the primary is actually part of this run.
      when: "inventory_hostname != primary_control_node and primary_control_node in ansible_play_hosts"
    - name: check stop_cron_job
      debug:
        var: stop_cron_job
    - name: Creates an entry like SMC_ROOT_DIR on top of crontab
      ansible.builtin.cron:
        name: SMC_ROOT_DIR
        user: root
        env: true
        job: "{{ root_dir }}"
    - name: Creates an entry like CRON_ENV to indicate if invoking by crontab
      ansible.builtin.cron:
        name: CRON_ENV
        user: root
        env: true
        job: "1"
    - name: Cron for root user - fetch pdu power
      ansible.builtin.cron:
        name: "fetch the metrics about PDU power"
        user: root
        minute: "1-59"
        job: "${SMC_ROOT_DIR}/bin/pdu-power/fetch-metrics-multiple-node.sh pdu"
        disabled: "{{ stop_cron_job }}"
    - name: Cron for root user - retention pdu power
      ansible.builtin.cron:
        name: "retention the metrics about PDU power"
        user: root
        hour: "1"
        minute: "0"
        job: "${SMC_ROOT_DIR}/bin/pdu-power/retention-metrics-multiple-node.sh pdu"
        disabled: "{{ stop_cron_job }}"
    - name: Creates an entry like SMC_ROOT_DIR on top of kapl crontab
      ansible.builtin.cron:
        name: SMC_ROOT_DIR
        user: kapl
        env: true
        job: "{{ root_dir }}"
    - name: Cron for kapl user - txt to csv
      ansible.builtin.cron:
        name: "transfer txt to csv"
        user: kapl
        minute: "2-59/5"
        job: "${SMC_ROOT_DIR}/bin/supports-misc/403-cron-txt-to-csv.sh.x"
        disabled: "{{ stop_cron_job }}"
    - name: Cron for kapl user - csv to html
      ansible.builtin.cron:
        name: "transfer csv to html"
        user: kapl
        minute: "3-59/5"
        job: "${SMC_ROOT_DIR}/bin/supports-misc/404-cron-csv-to-html.sh.x"
        disabled: "{{ stop_cron_job }}"
    - name: Cron for kapl user - csv to json
      ansible.builtin.cron:
        name: "transfer csv to json"
        user: kapl
        minute: "4-59/5"
        job: "${SMC_ROOT_DIR}/bin/supports-misc/405-cron-csv-to-json.sh.x"
        disabled: "{{ stop_cron_job }}"

# Play 2: compute nodes — per-node metric collection, always enabled.
- name: Compute Node Crontab setting
  hosts: cluster_sn,cluster_ln,cluster_an
  vars:
    root_dir: "{{ project_root }}/{{ project_env }}"
  become: true
  gather_facts: false
  tasks:
    - name: Creates an entry like SMC_ROOT_DIR on top of crontab
      ansible.builtin.cron:
        name: SMC_ROOT_DIR
        user: root
        env: true
        job: "{{ root_dir }}"
    - name: Cron for root user - fetch power temperature
      ansible.builtin.cron:
        name: "fetch the metrics about Power Temperature"
        user: root
        minute: "1-59"
        job: "${SMC_ROOT_DIR}/bin/pdu-power/fetch-metrics-single-node.sh"
    - name: Cron for root user - retention power temperature
      ansible.builtin.cron:
        name: "retention the metrics about Power Temperature"
        user: root
        hour: "1"
        minute: "0"
        job: "${SMC_ROOT_DIR}/bin/pdu-power/retention-metrics-single-node.sh"
```
## nagios
### install
``` yaml
---
- name: Install Nagios Core and Plugins
  hosts: cluster_cn
  vars:
    core_version: 4.4.8
    plugins_version: 2.4.0
    core_install_dir: /tmp/nagios-{{ core_version }}
    plugins_install_dir: /tmp/nagios-plugins-release-{{ plugins_version }}
    user_file: /usr/local/nagios/etc/htpasswd.users
  vars_files:
    - ../../vars/rhel.yml
  become: true
  tasks:
    # k22cn002 reads the shared tree through the -master mount point.
    - name: Reset project root
      set_fact:
        project_root: /opt_shared-master/smci/kapl
      when: inventory_hostname == 'k22cn002'
    - name: Debug
      debug:
        msg: "{{ ansible_distribution_major_version }} {{ project_root }}"
    - name: Disable SELINUX
      # Edits the persistent config only; takes effect after the next reboot.
      shell: sed -i 's/^SELINUX=.*$/SELINUX=disabled/' /etc/selinux/config
- name: Install Nagios Prerequisites
yum:
name:
- gcc
- glibc
- glibc-common
- perl
- httpd
- php
- wget
- gd
- gd-devel
- openssl-devel
state: present
when: ansible_distribution == "RedHat" and ansible_distribution_major_version == "8"
# https://support.nagios.com/kb/article.php?id=569#RHEL
- name: Install epel rpm from a local file
yum:
name: "{{ project_repo }}/linux/rpm/epel-release-latest-8.noarch.rpm"
state: present
- name: Enable repo
shell: subscription-manager repos --enable "codeready-builder-for-rhel-8-x86_64-rpms"
- name: Install Nagios Plugins Prerequisites
yum:
name:
- make
- gettext
- automake
- autoconf
- net-snmp
- net-snmp-utils
- perl-Net-SNMP
- fping
- lm_sensors
state: present
when: ansible_distribution == "RedHat" and ansible_distribution_major_version == "8"
- name: Get rid of semi-created installation dir
file:
path: "{{ item }}"
state: absent
loop:
- "{{ core_install_dir }}"
- "{{ plugins_install_dir }}"
- name: Copy core file
copy:
src: "{{ project_repo }}/linux/tar-gz/nagios-{{core_version}}.tar.gz"
dest: "{{ core_install_dir }}.tar.gz"
- name: Copy plugins file
copy:
src: "{{ project_repo }}/linux/tar-gz/nagios-plugins-release-{{ plugins_version }}.tar.gz"
dest: "{{ plugins_install_dir }}.tar.gz"
- name: Create directory for uncompressed
file:
path: "{{ item }}"
state: directory
mode: '0777'
loop:
- "{{ core_install_dir }}"
- "{{ plugins_install_dir }}"
- "{{ project_log }}/nagios"
- name: Extract Nagios Core
unarchive:
src: "{{ core_install_dir }}.tar.gz"
dest: "{{ core_install_dir }}"
extra_opts:
- --strip-components=1
- name: Extract Nagios Pligins
unarchive:
src: "{{ plugins_install_dir }}.tar.gz"
dest: "{{ plugins_install_dir }}"
extra_opts:
- --strip-components=1
- name: Compile
shell: "{{ item }} >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-compile.log"
args:
chdir: "{{ core_install_dir }}"
loop:
- ./configure
- make all
- name: Create User And Group
shell: "{{ item }} >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-install-groups-users.log"
args:
chdir: "{{ core_install_dir }}"
loop:
- make install-groups-users
- usermod -a -G nagios apache
- name: Install Binaries, Service / Daemon
shell: "{{ item }} >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-install.log"
args:
chdir: "{{ core_install_dir }}"
loop:
- make install
- make install-daemoninit
- name: Install Command Mode
shell: "make install-commandmode >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-install-commandmode.log"
args:
chdir: "{{ core_install_dir }}"
- name: Install Configuration Files
shell: "make install-config >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-install-config.log"
args:
chdir: "{{ core_install_dir }}"
- name: Install Apache Config Files
shell: "make install-webconf >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-install-webconf.log"
args:
chdir: "{{ core_install_dir }}"
- name: Configure Firewall for Nagios Core web interface
shell: "{{ item }}"
loop:
- firewall-cmd --zone=public --add-port=80/tcp
- firewall-cmd --zone=public --add-port=80/tcp --permanent
- name: Get rid of nagios pass file
file:
path: "{{ user_file }}"
state: absent
- name: Create nagiosadmin User Account
shell: "htpasswd -b -c {{ user_file }} nagiosadmin {{ kapl_password }}"
- name: Create kapl User Account
shell: "htpasswd -b {{ user_file }} {{kapl_username}} {{ kapl_password }}"
- name: Plugins - Compile
shell: "{{ item }} >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-plugins-compile.log"
args:
chdir: "{{ plugins_install_dir }}"
loop:
- ./tools/setup
- ./configure
- make
- name: Plugins - Install
shell: "make install >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-plugins-install.log"
args:
chdir: "{{ plugins_install_dir }}"
- name: Plugins - Verify if installed
stat:
path: /usr/local/nagios/libexec/check_ping
register: plugins_installed
- name: Debug Plugins - Verify if installed
debug:
msg: "Plugins are installed: {{ plugins_installed.stat.exists }}"
- name: run and enable systemd service
service:
name: "{{ item }}"
state: restarted
enabled: yes
become: yes
loop:
- httpd
- nagios
### config
---
- name: Config Nagios Core and Plugins
  hosts: cluster_cn
  vars:
    config_dir: /usr/local/nagios/etc
  vars_files:
    - ../../vars/rhel.yml
  become: true
  tasks:
    - name: Reset project root
      set_fact:
        project_root: /opt_shared-master/smci/kapl
      when: inventory_hostname == 'k22cn002'
    - name: Debug
      debug:
        msg: "{{ groups['cluster_cn'] | join(',') }} {{ project_root }}"
    - name: Check user privilege
      shell: "grep 'nagiosadmin,{{ kapl_username }}' {{ config_dir }}/cgi.cfg"
      register: check_user_priv
      # BUG FIX: grep exits 1 when the entry is absent — exactly the case the
      # next task handles; without this the play aborted here. Only rc > 1
      # (e.g. missing file) is a real error.
      failed_when: check_user_priv.rc > 1
    - name: Config users privilege
      replace:
        path: "{{ config_dir }}/cgi.cfg"
        regexp: 'nagiosadmin'
        replace: "nagiosadmin,{{ kapl_username }}"
        backup: false
      when: check_user_priv.rc == 1
- name: Ensure swiches configuration dir
lineinfile:
path: "{{ config_dir }}/nagios.cfg"
regexp: "^cfg_dir={{ config_dir }}/switches"
insertafter: "^#cfg_dir={{ config_dir }}/switches"
line: "cfg_dir={{ config_dir }}/switches"
- name: Ensure server configuration dir
lineinfile:
path: "{{ config_dir }}/nagios.cfg"
regexp: "^cfg_dir={{ config_dir }}/servers"
insertafter: "^#cfg_dir={{ config_dir }}/servers"
line: "cfg_dir={{ config_dir }}/servers"
- name: Ensure pdu configuration dir
lineinfile:
path: "{{ config_dir }}/nagios.cfg"
regexp: "^cfg_dir={{ config_dir }}/routers"
insertafter: "^#cfg_dir={{ config_dir }}/routers"
line: "cfg_dir={{ config_dir }}/pdus"
- name: Create directory for configuration
file:
path: "{{ item }}"
state: directory
owner: nagios
group: nagios
mode: '0755'
loop:
- "{{ config_dir }}/servers"
- "{{ config_dir }}/switches"
- "{{ config_dir }}/pdus"
- name: Generate control node config
template:
src: ../../templates/nagios/control_node.cfg.j2
dest: "{{ config_dir }}/servers/{{ inventory_hostname }}.cfg"
owner: nagios
group: nagios
- name: Generate group config
template:
src: ../../templates/nagios/group_node_type.cfg.j2
dest: "{{ config_dir }}/servers/group_node_type.cfg"
owner: nagios
group: nagios
control_node.cfg.j2
###############################################################################
# LOCALHOST.CFG - SAMPLE OBJECT CONFIG FILE FOR MONITORING THIS MACHINE
#
#
# NOTE: This config file is intended to serve as an *extremely* simple
# example of how you can create configuration entries to monitor
# the local (Linux) machine.
#
###############################################################################
###############################################################################
#
# HOST DEFINITION
#
###############################################################################
# Define a host for the local machine
define host {
use linux-server ; Name of host template to use
; This host definition will inherit all variables that are defined
; in (or inherited by) the linux-server host template definition.
host_name {{ inventory_hostname }}
alias {{ inventory_hostname }}
address {{ hostvars[inventory_hostname]['ansible_host'] }}
}
###############################################################################
#
# SERVICE DEFINITIONS
#
###############################################################################
# Define a service to "ping" the local machine
define service {
use generic-service ; Name of service template to use
host_name {{ inventory_hostname }}
service_description PING
check_command check_ping!100.0,20%!500.0,60%
}
define service {
use generic-service ; Name of service template to use
host_name {{ inventory_hostname }}
service_description SSH
check_command check_ssh
notifications_enabled 0
}
# Define a service to check HTTP on the local machine.
# Disable notifications for this service by default, as not all users may have HTTP enabled.
define service {
use generic-service ; Name of service template to use
host_name {{ inventory_hostname }}
service_description HTTP
check_command check_http
notifications_enabled 0
}
group_node_type.cfg.j2
###############################################################################
#
# HOST GROUP DEFINITION
#
###############################################################################
# Define an optional hostgroup for Linux machines
define hostgroup {
hostgroup_name cn ; The name of the hostgroup
alias Control Node ; Long name of the group
members {{ groups['cluster_cn'] | join(',') }} ; Comma separated list of hosts that belong to this group
}
define hostgroup {
hostgroup_name sn ; The name of the hostgroup
alias Small Node ; Long name of the group
members {{ groups['cluster_sn'] | join(',') }} ; Comma separated list of hosts that belong to this group
}
define hostgroup {
hostgroup_name ln ; The name of the hostgroup
alias Large Node ; Long name of the group
members {{ groups['cluster_ln'] | join(',') }} ; Comma separated list of hosts that belong to this group
}
define hostgroup {
hostgroup_name an ; The name of the hostgroup
alias Accelerated Node ; Long name of the group
members {{ groups['cluster_an'] | join(',') }} ; Comma separated list of hosts that belong to this group
}
## install package and verify network
---
- name: Initial validation for control node
  hosts: cluster_cn
  vars_files:
    - ../vars/rhel.yml
  tasks:
    - name: Fetch interface and set
      set_fact:
        # If several interfaces match, the last loop iteration wins.
        network_10g_interface_name: "{{ item }}"
      loop: "{{ ansible_facts.interfaces | select('match', '^enp[0-9]+s0f0$') }}"
    - name: set interface MTU 9000
      shell: |
        ifconfig {{ item }} mtu 9000
      loop:
        - "{{ network_10g_interface_name }}"
      # FIX: changing an interface MTU requires root; the original task
      # had no become (unlike the persistent-config task below).
      become: true
    - name: set interface MTU 9000 permanent
      lineinfile:
        path: /etc/sysconfig/network-scripts/ifcfg-{{ item }}
        line: MTU=9000
        state: present
      loop:
        - "{{ network_10g_interface_name }}"
      become: true
- name: Verify foo account and subscription
include_role:
name: rhel_account
- name: Prevent Kernel Upgrades
ansible.builtin.lineinfile:
path: /etc/yum.conf
line: exclude=kernel* redhat-release* kmod-kvdo
become: yes
- name: Allow chronyd works
ansible.builtin.lineinfile:
path: /etc/chrony.conf
line: allow 167.22.10.2
become: yes
when: inventory_hostname == primary_control_node
- name: Set timezone
community.general.timezone:
name: America/Los_Angeles
- name: stop and disable systemd service
service:
name: "{{ item }}"
state: stopped
enabled: no
become: yes
loop:
- firewalld
when: inventory_hostname == primary_control_node
- name: run and enable systemd service
service:
name: "{{ item }}"
state: started
enabled: yes
become: yes
loop:
- firewalld
when: inventory_hostname == backup_control_node
- name: run and enable systemd service
service:
name: "{{ item }}"
state: started
enabled: yes
become: yes
loop:
- chronyd
- nfs-server
- name: check network reachability - internet and dns
shell: ping -c 2 "{{ item }}"
loop:
- 8.8.8.8
- www.google.com
- name: check network reachability - 10gb network
shell: ping -c 2 167.22.10.2 -I enp33s0f0
become: yes
register: ping_result
failed_when: ping_result.rc != 0
ignore_errors: yes
- name: check network reachability - infiniband network
shell: ping -c 2 167.122.10.2 -I ib0
become: yes
register: ping_result
failed_when: ping_result.rc != 0
ignore_errors: yes
- name: check network reachability - BMC network
shell: ping -c 2 167.222.10.2
register: ping_result
failed_when: ping_result.rc != 0
ignore_errors: yes
- name: check chronyd
shell: chronyc tracking
register: chronyc_tracking
failed_when: "'Leap status : Normal' not in chronyc_tracking.stdout"
- name: Install the package group
ansible.builtin.yum:
name: "{{ item }}"
state: present
disablerepo: '*'
enablerepo: local-rhels8.7.0-x86_64--install-rhels8.7.0-x86_64-AppStream,local-rhels8.7.0-x86_64--install-rhels8.7.0-x86_64-BaseOS
become: yes
loop:
- "@Infiniband Support"
- "@Performance Tools"
- "@Development Tools"
- "@Scientific Support"
- "@System Tools"
- python39
- sshpass
when: ansible_distribution=="RedHat"
- include_role:
name: ipmitool_package
- name: Install ansible
ansible.builtin.pip:
name:
- ansible-core==2.14
- python-ipmi==0.5.4
- ansible-pylibssh==1.1.0 # for cisco switch
executable: pip3.9
- name: Install ansible collections
shell: ansible-galaxy collection install {{ item }}
loop:
- community.general
- ansible.posix
- ansible.utils
- cisco.nxos # for cisco switch
## Get a list of unreachable hosts in an ansible playbook
``` yaml
- name: Unreachable servers
  set_fact:
    down: "{{ ansible_play_hosts_all | difference(ansible_play_hosts) }}"
```
## Node Sync
### sync from cn001 to cn002
Method 1: if execute it on cn001
``` bash
ansible-playbook playbook/playbook_sync.yml --limit 'cn002' --extra-vars "mode=push"
Method 2: if execute it on cn002 and mode is pull, use sudo
sync from cn002 to cn001
Method 1: if execute it on cn001 and mode is pull, use sudo
Method 2: if execute it on cn002
sync only development content like script, config from cn001 to cn002
if execute it on cn002 and mode is pull, use sudo
sudo ansible-playbook playbook/playbook_sync.yml --tags develop --limit 'cn001' --extra-vars "mode=pull"
sync only ha content like data, reports, logs from cn001 to cn002
if execute it on cn002 and mode is pull, use sudo
sudo ansible-playbook playbook/playbook_sync.yml --tags data --limit 'cn001' --extra-vars "mode=pull"
``` yaml
# Push/pull the project tree between control nodes with rsync.
# mode=push: controller -> target; mode=pull: target -> controller.
- name: Sync Process
  hosts: cluster_cn
  become: true
  vars:
    development_content:
      - bin
      - config
      - scripts
    data_content:
      - data
      - reports
      - logs
  tasks:
    - name: Show the sync mode
      debug:
        msg:
          - "{{ mode }} mode"
          - "push mode from machine which execute ansible to {{ inventory_hostname }}"
          - "pull mode from {{ inventory_hostname }} to machine which execute ansible"
    - name: Synchronization development content like script, config using rsync protocol
      ansible.posix.synchronize:
        mode: "{{ mode }}"
        src: "{{ project_root }}/{{ project_env }}/{{ item }}/"
        dest: "{{ project_root }}/{{ project_env }}/{{ item }}/"
      loop: "{{ development_content }}"
      register: develop_sync_result
      tags:
        - develop
    - name: Show develop sync result
      debug:
        msg: "{{ item.stdout_lines }}"
      loop: "{{ develop_sync_result.results }}"
      tags:
        - develop
    - name: Synchronization ha content like data, reports, logs using rsync protocol
      ansible.posix.synchronize:
        mode: "{{ mode }}"
        src: "{{ project_root }}/{{ project_env }}/{{ item }}/"
        dest: "{{ project_root }}/{{ project_env }}/{{ item }}/"
        rsync_opts:
          - "--min-size=1"  # skip zero-byte files
      loop: "{{ data_content }}"
      # Own register name — the original reused develop_sync_result here.
      register: data_sync_result
      tags:
        - data
    - name: Show data sync result
      debug:
        msg: "{{ item.stdout_lines }}"
      loop: "{{ data_sync_result.results }}"
      tags:
        - data
```
## Template for loop from ansible_play_hosts and register with hostvars
``` yaml
---
- name: Get cluster name
  hosts: all
  gather_facts: true
  vars:
    cluster_cn_name: /root/foo/cn_name.txt
    default_owner: foo
    default_group: foo
  tasks:
    - name: Read contents of file
      slurp:
        src: "{{ cluster_cn_name }}"
      register: control_host
    - name: Print contents of file
      debug:
        msg: "{{ control_host['content'] | b64decode }}"
    - name: Generate host bom txt
      template:
        src: ../templates/ha_cluster_cn_name_bom.txt.j2
        dest: "{{ project_root }}/{{ project_env }}/data/ha/cluster_cn_name_bom.txt"
        owner: "{{ default_owner }}"
        group: "{{ default_group }}"
      delegate_to: localhost
      # The template itself iterates ansible_play_hosts via hostvars, so one
      # render covers every host; by this task all hosts have run slurp.
      run_once: true
```
ha_cluster_cn_name_bom.txt.j2
{% for play_hostname in ansible_play_hosts %}
{{ play_hostname }} {{ hostvars[play_hostname].control_host['content'] | b64decode | trim }}
{% endfor %}
Generate the known_hosts entries on the control node
---
# Scan each target host's SSH host keys from the control node and append
# any missing entries to the control node's ~/.ssh/known_hosts.
- name: Generate SSH keys and add to known_hosts
  hosts: all  # Replace with the target hosts you want to add to the known_hosts file
  gather_facts: false
  tasks:
    - name: Add public key to known_hosts
      # command (not shell) is sufficient: no pipes or redirection are used.
      # ansible_host resolves the same inventory value as
      # hostvars[inventory_hostname].ansible_host did.
      command: ssh-keyscan {{ ansible_host }}
      register: keyscan_output
      changed_when: false  # a key scan never modifies the target
      delegate_to: localhost

    - name: Ensure unique entries in known_hosts
      ansible.builtin.lineinfile:
        path: ~/.ssh/known_hosts
        line: "{{ item }}"
        create: true
      loop: "{{ keyscan_output.stdout_lines }}"
      when: keyscan_output.stdout_lines is defined
      delegate_to: localhost
      # Every host delegates to the same local file; serialize the writes so
      # concurrent lineinfile runs cannot clobber each other.
      throttle: 1
copy files with wildcard
get ip address from inventory group
- The extract filter is used to map from a list of indices to a list of values from a container (hash or array)
- jinja2 - Ansible: Get all the IP addresses of a group - Stack Overflow
block and condition with group
---
# Slurm / Singularity installation, restricted to RHEL-family 8 hosts.
# Per-role packages are gated by inventory group membership (group_names).
- hosts: all
  tasks:
    - name: Install packages for Redhat
      block:
        - name: Installation for all nodes
          ansible.builtin.dnf:
            name:
              - munge
              - slurm
              - slurm-contribs
              - slurm-perlapi
            state: latest

        - name: Installation for login nodes
          ansible.builtin.dnf:
            name:
              - slurm-devel
              - slurm-pmi
              - slurm-pmi-devel
            state: latest
          when: "'loginNode' in group_names"

        - name: Installation for control nodes
          ansible.builtin.dnf:
            name:
              - slurm-slurmctld
            state: latest
          when: "'controlNode' in group_names"

        - name: Install the singularity ce
          # https://docs.sylabs.io/guides/latest/admin-guide/installation.html#install-from-provided-rpm-deb-packages
          # NOTE(review): this is gated on databaseNode, same as the task
          # below — confirm Singularity is really meant for database nodes
          # rather than compute/accelerator nodes.
          ansible.builtin.dnf:
            name: 'https://github.com/sylabs/singularity/releases/download/v{{ options.singularity_ce_version }}/singularity-ce-{{ options.singularity_ce_version }}-1.el8.x86_64.rpm'
            state: present
            disable_gpg_check: true
          when: "'databaseNode' in group_names"

        - name: Installation for database nodes
          ansible.builtin.dnf:
            name:
              - mariadb-server
              - slurm-slurmdbd
            state: latest
          when: "'databaseNode' in group_names"

        - name: Installation for compute nodes
          ansible.builtin.dnf:
            name:
              - slurm-slurmd
              - slurm-pmi
            state: latest
          when: "'computeNode' in group_names or 'acceleratorNode' in group_names"
      # The whole block runs only on Rocky/RedHat major version 8.
      when: (ansible_distribution == "Rocky" or ansible_distribution == "RedHat") and ansible_distribution_major_version == "8"