# Ansible Practice

## Install the network-manager package and add the user `devops_user`

``` yaml
---
# Play 1: install pinned apache2 and network-manager, then report the
# installed network-manager version from package_facts.
- name: Display installed packages
  hosts: web
  tasks:
    - name: Install the package
      apt:
        name:
          - apache2=2.4.41-4ubuntu3.12
          - network-manager

    - name: Gather info on installed packages
      package_facts:
        manager: auto

    - name: List installed packages
      debug:
        var: ansible_facts.packages

    - name: Display NetworkManager version
      debug:
        msg: "Version {{ ansible_facts.packages['network-manager'][0].version }}"
      when: "'network-manager' in ansible_facts.packages"

# Play 2: ensure the groups exist, then create devops_user with an SSH key.
- name: New user devops
  hosts: web
  tasks:
    - name: Verify that auditors group exists
      group:
        name: "{{ item }}"
        state: present
      loop:
        - sys_admins
        - developers

    - name: Add new user to the development machine and assign the appropriate groups.
      user:
        name: devops_user
        shell: /bin/bash
        groups: sys_admins,developers
        append: true
        generate_ssh_key: true
        ssh_key_bits: 2048
        ssh_key_file: .ssh/id_my_rsa
```
      ## customized tool installation
      group_vars/all.yml
      
      ``` yaml
      # group_vars/all.yml — shared variables for the SUM tool installation.
      # NOTE: `env=production` is not valid YAML mapping syntax; it must be `env: production`.
      env: production
      installation_folder_dir: /tmp/sumtool
      sumtool_folder: "{{ installation_folder_dir }}/sum_2.9.0_Linux_x86_64"
      sumtool_url: https://www.supermicro.com/Bios/sw_download/466/sum_2.9.0_Linux_x86_64_20220804.tar.gz
      cluster_bmc_credential: "/foo/{{ env }}/config/cluster-bmc-credential.txt"
      

playbook.yml

# Create the working directory, then download the SUM tarball only when the
# extracted directory is not already present on the target.
- name: create `temp-installation` folder
  file:
    path: "{{ installation_folder_dir }}"
    state: directory

- name: check sumtool directory exists
  stat:
    path: "{{ sumtool_folder }}"
  register: sum_command

- name: download the Supermicro SUM tool
  get_url:
    url: "{{ sumtool_url }}"
    dest: "{{ installation_folder_dir }}"
  # Idiomatic boolean test; `== False` is discouraged by ansible-lint.
  when: not sum_command.stat.exists

# Unpack the tarball on the target; list_files records the archive contents
# so files[0] (the top-level directory) can name the extracted tree.
- name: uncompress the tar file
  unarchive:
    src: "{{ installation_folder_dir }}/{{ sumtool_url | basename}}"
    dest: "{{ installation_folder_dir }}"
    remote_src: yes
    list_files: yes
  register: unzip_contents

# NOTE(review): whether regex_search returns None (vs '') on no match depends
# on the ansible-core version — confirm `is not none` behaves as intended here.
# cpu_series ends up as the string "7002" or "7003" (Jinja stringifies).
- name: set variable
  set_fact:
    sumtool_dir: "{{ installation_folder_dir }}/{{ unzip_contents.files[0] }}"
    cpu_series: "{{ 7002 if ansible_processor[2] | regex_search('7[0-9][0-9]2') is not none else 7003}}"

- name: show cpu_series
  debug:
    msg: 
      - "{{ cpu_series }}"
      - "{{ ansible_processor[2] }}"

# Pipeline rc comes from grep: non-zero means lxml is not installed.
# ignore_errors keeps the play alive so the next task can install it.
- name: check python-lxml is installed
  shell: pip3 list | grep lxml
  register: pip3_list
  ignore_errors: true

# NOTE: To use ansible xml, it's necessary to install python3 and python3-lxml.
- name: install python-lxml
  pip:
    name: lxml
  when: pip3_list.rc != 0
# Look up this host's BMC IP and password in the credential file.
# Read-only lookups are marked changed_when: false; anything carrying the
# BMC password is no_log so credentials never reach logs or -v output.
- name: get ipmi ip
  shell: |
      cat "{{cluster_bmc_credential}}" | grep "{{target_host}}" | awk '{split($4,a,"/");print a[2]}'
  register: ipmi_ip
  changed_when: false
- name: get ipmi password
  shell: |
      cat "{{cluster_bmc_credential}}" | grep "{{target_host}}" | awk '{print $6}'
  register: ipmi_password
  changed_when: false
  no_log: true
- name: get the current BIOS configurations
  shell:
    chdir: "{{ sumtool_dir }}"
    cmd: "./sum -i {{ipmi_ip.stdout}} -u ADMIN -p {{ipmi_password.stdout}} -c GetCurrentBiosCfg --overwrite --file {{ sumtool_dir }}/{{original_bios_config_file}}"
  ignore_errors: true
  become: true
  # The password appears on the command line; keep it out of task output.
  no_log: true

## Install Ansible and prerequisite package groups

  # Passing the whole list to yum runs one transaction instead of one per
  # item (a loop over a package module is an ansible-lint warning).
  - name: Install the package group
    ansible.builtin.yum:
      name:
        - "@Infiniband Support"
        - "@Performance Tools"
        - "@Development Tools"
        - "@Scientific Support"
        - "@System Tools"
        - python39
      state: present
    become: true
    when: ansible_distribution == "RedHat"
  - include_role:
      name: ipmitool_package
  # Pin ansible-core and install it with the python3.9 interpreter's pip.
  - name: Install ansible
    ansible.builtin.pip:
      name: ansible-core==2.14
      executable: pip3.9

## Check software status and render report templates

---
# Collect per-node hardware/software status on all compute groups and render
# BOM report files (txt/json) from Jinja2 templates.
- name: Validate Cluster Preparation
  hosts: cluster_sn,cluster_ln,cluster_an
  vars:
    default_owner: kapl
    default_group: kapl
  tasks:
  # NOTE(review): group_names is sorted alphabetically; indexing [1] assumes
  # each host sits in exactly two groups with cluster_<type> second —
  # confirm against the inventory before relying on this.
  - name: Node Type
    set_fact:
      node_type: "{{ group_names[1].split(\"_\")[1] }}"
  - name: Debug Node Type
    debug:
      msg: "node type: {{ node_type }}"
  # grep exits non-zero when the key is absent; with the default failure
  # behavior that aborted the play, so passwordless_status could never be
  # false. failed_when: false keeps rc available for the status computation.
  - name: Passwordless - root - authorized_keys
    stat:
      path: /root/.ssh/authorized_keys
    register: passwordless_root_authorized_keys
  - name: Passwordless - root - cn001 public key
    shell: grep 'root@k22cn001' /root/.ssh/authorized_keys
    register: passwordless_root_control_node_1
    changed_when: false
    failed_when: false
  - name: Passwordless - root - cn002 public key
    shell: grep 'root@k22cn002' /root/.ssh/authorized_keys
    register: passwordless_root_control_node_2
    changed_when: false
    failed_when: false
  - name: Passwordless - kapl - authorized_keys
    stat:
      path: /home/kapl/.ssh/authorized_keys
    register: passwordless_kapl_authorized_keys
  - name: Passwordless - kapl - cn001 public key
    shell: grep 'kapl@k22cn001' /home/kapl/.ssh/authorized_keys
    register: passwordless_kapl_control_node_1
    changed_when: false
    failed_when: false
  - name: Passwordless - kapl - cn002 public key
    shell: grep 'kapl@k22cn002' /home/kapl/.ssh/authorized_keys
    register: passwordless_kapl_control_node_2
    changed_when: false
    failed_when: false
  # true only when the authorized_keys file exists AND the grep matched.
  - name: Passwordless - status
    set_fact:
      passwordless_status:
        control_node_1:
          root: "{{ passwordless_root_authorized_keys.stat.exists and passwordless_root_control_node_1.rc == 0 }}"
          kapl: "{{ passwordless_kapl_authorized_keys.stat.exists and passwordless_kapl_control_node_1.rc == 0 }}"
        control_node_2:
          root: "{{ passwordless_root_authorized_keys.stat.exists and passwordless_root_control_node_2.rc == 0 }}"
          kapl: "{{ passwordless_kapl_authorized_keys.stat.exists and passwordless_kapl_control_node_2.rc == 0 }}"
  # /sys/devices/system/cpu/smt/active reads "1" when SMT is enabled.
  - name: Get CPU SMT
    shell: cat /sys/devices/system/cpu/smt/active
    register: cpu_smt
  # Note: this yields the *string* "true"/"false", not a boolean; the JSON
  # template later serializes it with to_json as a quoted string.
  - name: CPU - SMT status
    set_fact:
      cpu_smt_status: "{% if (cpu_smt.stdout == '1' ) %}true{% else %}false{% endif %}"
  - name: Fetch interface names
    debug:
      msg: "{{ item }}"
    loop: "{{ ansible_facts.interfaces | select('match', '^enp[0-9]+s0f0$')}}"
  # The loop overwrites the fact every iteration, so the last matching
  # interface wins — assumes exactly one enp*s0f0 per host (TODO confirm).
  - name:  Fetch interface and set 
    set_fact:
      network_10g_interface_name: "{{ item }}"
    loop: "{{ ansible_facts.interfaces | select('match', '^enp[0-9]+s0f0$')}}"
  # driver/firmware start empty and are filled in by update_fact below.
  - name: Set fact for network - 10g
    set_fact:
      network_10g:
        name: "{{ network_10g_interface_name }}"
        mtu: "{{ ansible_facts[network_10g_interface_name].mtu }}"
        speed: "{{ ansible_facts[network_10g_interface_name].speed  }}"
        driver: {}
        firmware: {}
      network_100g:
        name: ib0
        mtu: "{{ ansible_facts['ib0'].mtu }}"
        speed: "{{ ansible_facts['ib0'].speed  }}"
        driver: {}
        firmware: {}
  # ethtool -i prints "version:", "firmware-version:" lines; awk picks the
  # value column and head keeps the first "version" match only.
  - name: Get network driver - 10g
    shell: ethtool -i {{ network_10g.name }} | grep version | head -n 1 | awk '{print $2}'
    register: network_10g_driver
  - name: Get network firmware - 10g
    shell: ethtool -i {{ network_10g.name }} | grep firmware-version | awk '{print $3}'
    register: network_10g_firmware
  - name: Get network driver - 100g
    shell: ethtool -i {{ network_100g.name }} | grep version | head -n 1 | awk '{print $2}'
    register: network_100g_driver
  # NOTE(review): 10g uses $3 and 100g uses $2 for firmware-version — the
  # ethtool output layout presumably differs between drivers; verify.
  - name: Get network firmware - 100g
    shell: ethtool -i {{ network_100g.name }} | grep firmware-version | awk '{print $2}'
    register: network_100g_firmware
  # ansible.utils.update_fact returns the patched structures under `updated`;
  # the next task copies them back over the originals.
  - name: Update network driver, firmware
    ansible.utils.update_fact:
      updates:
      - path: network_10g.driver.version
        value: "{{ network_10g_driver.stdout }}"
      - path: network_10g.firmware.version
        value: "{{ network_10g_firmware.stdout }}"
      - path: network_100g.driver.version
        value: "{{ network_100g_driver.stdout }}"
      - path: network_100g.firmware.version
        value: "{{ network_100g_firmware.stdout }}"
    register: updated
  - name: Update fact for network
    set_fact:
      network_10g: "{{ updated.network_10g }}"
      network_100g: "{{ updated.network_100g }}"
  - name: Debug network variables
    debug:
      msg: 
      - "{{ network_10g }}"
      - "{{ network_100g }}"
  # subscription-manager exits non-zero when the system is not current; the
  # default failure handling would abort the play before the status fact is
  # set, so capture rc/stdout and continue.
  - name: Get subscription info
    shell: subscription-manager status
    register: subscription_info
    become: true
    changed_when: false
    failed_when: false
  - name: set_status either to True or False
    set_fact:
      # Match against stdout (a string). The original applied regex_search to
      # stdout_lines, a *list*, which cannot match.
      subscription_status: "{% if ('Overall Status: Current' in subscription_info.stdout) %}true{% else %}false{% endif %}"
  # HPL is "ready" only when both the code-drop tarball and the prebuilt
  # environment directory exist.
  - name: Benchmark - HPL code drop
    stat:
      path: /tmp/Ansible_HPL/install_packages/amd-zen-hpl-avx2-2023_01.tar
    register: hpl_code_drop
  - name: Benchmark - HPL prebuilt env
    stat:
      path: /AMD-HPL
    register: hpl_prebuilt_env
  - name: Benchmark - HPL status
    set_fact:
      benchmark_hpl_status: "{% if (hpl_code_drop.stat.exists and hpl_prebuilt_env.stat.exists ) %}true{% else %}false{% endif %}"
  # ignore_errors guarantees mpi_spack.rc exists for the status task below.
  - name: Benchmark - MPI spack
    shell: source /tmp/spack/share/spack/setup-env.sh; spack --version
    register: mpi_spack
    ignore_errors: yes
  - name: Benchmark - MPI status
    set_fact:
      benchmark_mpi_status: "{% if (mpi_spack.rc == 0 ) %}true{% else %}false{% endif %}"
      benchmark_mpi_spack_version: "{{ mpi_spack.stdout }}"
  - name: Benchmark - SPECRate install directory
    stat:
      path: /tmp/Ansible_SPEC-CPU-2017/cpu2017
    register: spec_rate_install_directory
  - name: Benchmark - SPECRate config
    stat:
      path: /tmp/Ansible_SPEC-CPU-2017/install_packages/config/spec-cpu_fprate_amd-7742_2_aocc_320_linux_x86.tar.xz
    register: spec_rate_config
  - name: Benchmark - SPECRate status
    set_fact:
      benchmark_spec_rate_status: "{% if (spec_rate_install_directory.stat.exists and spec_rate_config.stat.exists ) %}true{% else %}false{% endif %}"
  - name: Benchmark - SPECAccel install directory
    stat:
      path: /tmp/Ansible_SPEC-ACCEL/accel
    register: spec_accel_install_directory
  - name: Benchmark - SPECAccel config
    stat:
      path: /tmp/Ansible_SPEC-ACCEL/install_packages/config/spec-accel_openacc_amd-7742_2_aocc_320_linux_x86.cfg
    register: spec_accel_config
  - name: Benchmark - SPECAccel flag
    stat:
      path: /tmp/Ansible_SPEC-ACCEL/install_packages/config/nvidia_flags.xml
    register: spec_accel_flag
  - name: Benchmark - SPECAccel status
    set_fact:
      benchmark_spec_accel_status: "{% if (spec_accel_install_directory.stat.exists and spec_accel_config.stat.exists and spec_accel_flag.stat.exists ) %}true{% else %}false{% endif %}"
  - name: Benchmark - iperf3
    shell: rpm -qa | grep iperf3
    register: iperf3_installed
    ignore_errors: yes
    when: ansible_os_family == "RedHat"
  - name: Benchmark - iperf3 status
    set_fact:
      # default(1) guards the non-RedHat case: when the rpm check is skipped,
      # iperf3_installed has no rc attribute and the original expression
      # raised an undefined-attribute error.
      benchmark_iperf3_status: "{% if ( iperf3_installed.rc | default(1) == 0 ) %}true{% else %}false{% endif %}"
  # Pipeline rc is awk's (always 0), so the nfs check relies on 'nfs'
  # appearing in stdout rather than on grep's exit code.
  - name: Mount - /opt_shared
    shell: mount | grep /opt_shared | awk '{print $1, $2, $3, $4, $5}'
    register: mount_opt_shared_result
    changed_when: false
  - name: Mount - /opt_shared info
    set_fact:
      nfs_client:
        mount_opt_shared_status: "{{ mount_opt_shared_result.rc == 0 and 'nfs' in mount_opt_shared_result.stdout }}"
        mount_opt_shared_info: "{{ mount_opt_shared_result.stdout }}"
  # Raw lsblk / df output is captured for the lsblk.txt / df.txt templates.
  - name: lsblk info
    shell: lsblk
    register: lsblk_result
  - name: df -hT info
    shell: df -hT
    register: df_result
  # Build /dev/nvmeXnY paths from the device-name facts.
  - name: Fetch disk name and set
    set_fact:
      disk_names: "{{ ansible_facts.devices.keys() | select('match', '^nvme[0-9]{1}n[0-9]{1}') | map('regex_replace', '^(.*)$', '/dev/\\1') | list }}"
  - name: Debug disk names
    debug:
      var: disk_names
  # One result per disk; the smartctl template indexes into .results.
  - name: smartctl info
    shell: "smartctl --all {{ item }}"
    loop: "{{ disk_names }}"
    register: smartctl_result
  - name: Debug directory for data file
    debug:
      msg: 
      - "{{ cluster_node_bom_host }}/{{ inventory_hostname }}"
  - name: Create directory for data file
    file:
      path: "{{ item }}"
      state: directory
      owner: "{{ default_owner }}"
      group: "{{ default_group }}"
      mode: '0777'
    loop:
      - "{{ cluster_node_bom_host }}/{{ inventory_hostname }}"
  # Render every per-node report in one loop; all files share the same
  # destination directory, owner, and group, so one task suffices.
  - name: Generate node report files
    template:
      src: "../templates/{{ item.src }}"
      dest: "{{ cluster_node_bom_host }}/{{ inventory_hostname }}/{{ item.dest }}"
      owner: "{{ default_owner }}"
      group: "{{ default_group }}"
    loop:
      - { src: host_bom.txt.j2, dest: "{{ node_type }}-bom.txt" }
      - { src: host_bom.json.j2, dest: "{{ node_type }}-bom.json" }
      - { src: host_lsblk.txt.j2, dest: lsblk.txt }
      - { src: host_df.txt.j2, dest: df.txt }
      - { src: host_smartctl.txt.j2, dest: smartctl.txt }

host_bom.json.j2

{# host_bom.json.j2 — machine-readable node BOM. The original had an
   unbalanced brace after "cpu" and a missing comma, producing invalid JSON. #}
{
    "os_version": "{{ ansible_distribution_version }}",
    "kernel_version": "{{ ansible_kernel }}",
    "cpu": {
        "smt": {{ cpu_smt_status | to_json }}
    },
    "network": {
        "10g": {{ network_10g | to_json }},
        "100g": {{ network_100g | to_json }}
    },
    "license": {{ subscription_status | to_json }},
    "benchmark": {
        "hpl": {"installed": {{ benchmark_hpl_status | to_json }} },
        "mpi": {"installed": {{ benchmark_mpi_status | to_json }} },
        "spec_rate": {"installed": {{ benchmark_spec_rate_status | to_json }} },
        "spec_accel": {"installed": {{ benchmark_spec_accel_status | to_json }} },
        "iperf3": {"installed": {{ benchmark_iperf3_status | to_json }} }
    },
    "passwordless": {{ passwordless_status | to_json }},
    "nfs_client": {{ nfs_client | to_json }}
}

host_bom.txt.j2

{# host_bom.txt.j2 — fixed-width (35/30/30 columns) human-readable BOM built
   from the facts set in the Validate Cluster Preparation play. The Jinja
   comment is stripped at render time, so output is unchanged. #}
##! Title: Get Node Bom {{ node_type | upper }} ({{ inventory_hostname }})
##! Priority: 1
##! TimeStamp: {{ "%m%d:%H%M" | strftime(ansible_date_time.epoch) }}
{{ "%-35s%-30s%-30s" | format("Name", "Value", "Notes") }}
---------------------------------------------------------------------
{{ "%-35s%-30s%-30s" | format("OS-Version", ansible_distribution_version, "-") }}
{{ "%-35s%-30s%-30s" | format("Kernel-Version", ansible_kernel, "-") }}
{{ "%-35s%-30s%-30s" | format("CPU-SMT", cpu_smt_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-10G-Name", network_10g.name, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-10G-Driver-Version", network_10g.driver.version, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-10G-FW-Version", network_10g.firmware.version, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-10G-MTU", network_10g.mtu, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-10G-Speed", network_10g.speed, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-100G-Name", network_100g.name, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-100G-Driver-Version", network_100g.driver.version, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-100G-FW-Version", network_100g.firmware.version, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-100G-MTU", network_100g.mtu, "-") }}
{{ "%-35s%-30s%-30s" | format("Net-100G-Speed", network_100g.speed, "-") }}
{{ "%-35s%-30s%-30s" | format("License-Registered", subscription_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Benchmarks-HPL-Installed", benchmark_hpl_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Benchmarks-MPI-Installed", benchmark_mpi_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Benchmarks-SPECRate-Installed", benchmark_spec_rate_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Benchmarks-SPECAccel-Installed", benchmark_spec_accel_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Benchmarks-Iperf-Installed", benchmark_iperf3_status, "-") }}
{{ "%-35s%-30s%-30s" | format("Passwordless-cn001-root", passwordless_status.control_node_1.root, "-") }}
{{ "%-35s%-30s%-30s" | format("Passwordless-cn001-kapl", passwordless_status.control_node_1.kapl, "-") }}
{{ "%-35s%-30s%-30s" | format("Passwordless-cn002-root", passwordless_status.control_node_2.root, "-") }}
{{ "%-35s%-30s%-30s" | format("Passwordless-cn002-kapl", passwordless_status.control_node_2.kapl, "-") }}
{{ "%-35s%-30s%-30s" | format("NFS-Client-Mount", nfs_client.mount_opt_shared_status, nfs_client.mount_opt_shared_info) }}

host_smartctl.txt.j2

{# host_smartctl.txt.j2 — one section per disk. Looping over results handles
   any number of NVMe devices; the original hard-coded exactly two. #}
{% for result in smartctl_result.results %}
------------------------------------------------------------------
{{ result.item }}
------------------------------------------------------------------
{{ result.stdout }}

{% endfor %}

cron

``` yaml
---
# Play 1: manage control-node crontabs. On non-primary control nodes the cron
# jobs are created but disabled (stop_cron_job). The cron `name:` values are
# the crontab markers and must stay stable for idempotence.
- name: Control Node Crontab setting
  hosts: cluster_cn
  vars:
    root_dir: "{{ project_root }}/{{ project_env }}"
    primary_control_node: k22cn001
    stop_cron_job: false
  become: true
  tasks:
    - name: set_fact
      set_fact:
        stop_cron_job: true
      when: "inventory_hostname != primary_control_node and primary_control_node in ansible_play_hosts"
    - name: check stop_cron_job
      debug:
        var: stop_cron_job
    - name: Creates an entry like SMC_ROOT_DIR on top of crontab
      ansible.builtin.cron:
        name: SMC_ROOT_DIR
        user: root
        env: true
        job: "{{ root_dir }}"
    - name: Creates an entry like CRON_ENV to indicate if invoking by crontab
      ansible.builtin.cron:
        name: CRON_ENV
        user: root
        env: true
        job: 1
    - name: Cron for root user - fetch pdu power
      ansible.builtin.cron:
        name: "fetch the metrics about PDU power"
        user: root
        minute: "1-59"
        job: "${SMC_ROOT_DIR}/bin/pdu-power/fetch-metrics-multiple-node.sh pdu"
        disabled: "{{ stop_cron_job }}"
    - name: Cron for root user - retention pdu power
      ansible.builtin.cron:
        name: "retention the metrics about PDU power"
        user: root
        hour: "1"
        minute: "0"
        job: "${SMC_ROOT_DIR}/bin/pdu-power/retention-metrics-multiple-node.sh pdu"
        disabled: "{{ stop_cron_job }}"
    - name: Creates an entry like SMC_ROOT_DIR on top of crontab
      ansible.builtin.cron:
        name: SMC_ROOT_DIR
        user: kapl
        env: true
        job: "{{ root_dir }}"
    - name: Cron for kapl user - txt to csv
      ansible.builtin.cron:
        name: "transfer txt to csv"
        user: kapl
        minute: "2-59/5"
        job: "${SMC_ROOT_DIR}/bin/supports-misc/403-cron-txt-to-csv.sh.x"
        disabled: "{{ stop_cron_job }}"
    - name: Cron for kapl user - csv to html
      ansible.builtin.cron:
        name: "transfer csv to html"
        user: kapl
        minute: "3-59/5"
        job: "${SMC_ROOT_DIR}/bin/supports-misc/404-cron-csv-to-html.sh.x"
        disabled: "{{ stop_cron_job }}"
    - name: Cron for kapl user - csv to json
      ansible.builtin.cron:
        name: "transfer csv to json"
        user: kapl
        minute: "4-59/5"
        job: "${SMC_ROOT_DIR}/bin/supports-misc/405-cron-csv-to-json.sh.x"
        disabled: "{{ stop_cron_job }}"

# Play 2: compute-node crontabs (always enabled).
- name: Compute Node Crontab setting
  hosts: cluster_sn,cluster_ln,cluster_an
  vars:
    root_dir: "{{ project_root }}/{{ project_env }}"
  become: true
  gather_facts: false
  tasks:
    - name: Creates an entry like SMC_ROOT_DIR on top of crontab
      ansible.builtin.cron:
        name: SMC_ROOT_DIR
        user: root
        env: true
        job: "{{ root_dir }}"
    - name: Cron for root user - fetch power temperature
      ansible.builtin.cron:
        name: "fetch the metrics about Power Temparature"
        user: root
        minute: "1-59"
        job: "${SMC_ROOT_DIR}/bin/pdu-power/fetch-metrics-single-node.sh"
    - name: Cron for root user - retention power temperature
      ansible.builtin.cron:
        name: "retention the metrics about Power Temparature"
        user: root
        hour: "1"
        minute: "0"
        job: "${SMC_ROOT_DIR}/bin/pdu-power/retention-metrics-single-node.sh"
```
## nagios
### install

``` yml
---
- name: Install Nagios Core and Plugins
  hosts: cluster_cn
  vars:
    core_version: 4.4.8
    plugins_version: 2.4.0
    core_install_dir: /tmp/nagios-{{core_version}}
    plugins_install_dir: /tmp/nagios-plugins-release-{{ plugins_version }}
    user_file: /usr/local/nagios/etc/htpasswd.users
  vars_files:
    - ../../vars/rhel.yml
  become: yes
  tasks:
  # k22cn002 mounts the shared tree under a different path.
  - name: Reset project root
    set_fact:
      project_root: /opt_shared-master/smci/kapl
    when: inventory_hostname == 'k22cn002'
  - name: Debug
    debug:
      msg: "{{ ansible_distribution_major_version }} {{ project_root }}"
  - name: Disable SELINUX
    # NOTE(review): sed via shell always reports changed and skips check mode;
    # consider ansible.posix.selinux. A reboot is needed for this to apply.
    shell: sed -i 's/^SELINUX=.*$/SELINUX=disabled/' /etc/selinux/config
  # Build-time and runtime prerequisites per the Nagios RHEL install guide.
  - name: Install Nagios Prerequisites
    yum:
      name:
        - gcc
        - glibc
        - glibc-common
        - perl
        - httpd
        - php
        - wget
        - gd
        - gd-devel
        - openssl-devel
      state: present
    when: ansible_distribution == "RedHat" and ansible_distribution_major_version == "8"
  # https://support.nagios.com/kb/article.php?id=569#RHEL
  - name: Install epel rpm from a local file
    yum:
      name: "{{ project_repo }}/linux/rpm/epel-release-latest-8.noarch.rpm"
      state: present
  - name: Enable repo
    shell: subscription-manager repos --enable "codeready-builder-for-rhel-8-x86_64-rpms"
  - name: Install Nagios Plugins Prerequisites
    yum:
      name:
        - make
        - gettext
        - automake
        - autoconf
        - net-snmp
        - net-snmp-utils
        - perl-Net-SNMP
        - fping
        - lm_sensors
      state: present
    when: ansible_distribution == "RedHat" and ansible_distribution_major_version == "8"
  # Remove any partial tree from a previous failed run before re-extracting.
  - name: Get rid of semi-created installation dir
    file:
      path: "{{ item }}"
      state: absent
    loop:
      - "{{ core_install_dir }}"
      - "{{ plugins_install_dir }}"
  # The tarballs are copied onto the target, so the unarchive tasks must use
  # remote_src: yes — without it unarchive looks for the tarball on the
  # controller and fails.
  - name: Copy core file
    copy:
      src: "{{ project_repo }}/linux/tar-gz/nagios-{{core_version}}.tar.gz"
      dest: "{{ core_install_dir }}.tar.gz"
  - name: Copy plugins file
    copy:
      src: "{{ project_repo }}/linux/tar-gz/nagios-plugins-release-{{ plugins_version }}.tar.gz"
      dest: "{{ plugins_install_dir }}.tar.gz"
  - name: Create directory for uncompressed
    file:
      path: "{{ item }}"
      state: directory
      mode: '0777'
    loop:
      - "{{ core_install_dir }}"
      - "{{ plugins_install_dir }}"
      - "{{ project_log }}/nagios"
  - name: Extract Nagios Core
    unarchive:
      src: "{{ core_install_dir }}.tar.gz"
      dest: "{{ core_install_dir }}"
      remote_src: yes
      extra_opts:
        - --strip-components=1
  - name: Extract Nagios Plugins
    unarchive:
      src: "{{ plugins_install_dir }}.tar.gz"
      dest: "{{ plugins_install_dir }}"
      remote_src: yes
      extra_opts:
        - --strip-components=1
  # Build output is appended to per-host log files under {{ project_log }}.
  - name: Compile
    shell: "{{ item }} >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-compile.log"
    args:
      chdir: "{{ core_install_dir }}"
    loop:
      - ./configure
      - make all
  - name: Create User And Group
    shell: "{{ item }} >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-install-groups-users.log"
    args:
      chdir: "{{ core_install_dir }}"
    loop:
      - make install-groups-users
      - usermod -a -G nagios apache
  - name: Install Binaries, Service / Daemon
    shell: "{{ item }} >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-install.log"
    args:
      chdir: "{{ core_install_dir }}"
    loop:
      - make install
      - make install-daemoninit
  - name: Install Command Mode
    shell: "make install-commandmode >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-install-commandmode.log"
    args:
      chdir: "{{ core_install_dir }}"
  - name: Install Configuration Files
    shell: "make install-config >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-install-config.log"
    args:
      chdir: "{{ core_install_dir }}"
  - name: Install Apache Config Files
    shell: "make install-webconf >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-install-webconf.log"
    args:
      chdir: "{{ core_install_dir }}"
  # Open port 80 both for the running firewall and permanently.
  - name: Configure Firewall for Nagios Core web interface
    shell: "{{ item }}"
    loop:
      - firewall-cmd --zone=public --add-port=80/tcp
      - firewall-cmd --zone=public --add-port=80/tcp --permanent
  # Recreate the htpasswd file from scratch each run. The htpasswd commands
  # carry the cleartext password on the command line, so suppress task
  # logging with no_log.
  - name: Get rid of nagios pass file
    file:
      path: "{{ user_file }}"
      state: absent
  - name: Create nagiosadmin User Account
    shell: "htpasswd -b -c {{ user_file }} nagiosadmin {{ kapl_password }}"
    no_log: true
  - name: Create kapl User Account
    shell: "htpasswd -b {{ user_file }} {{kapl_username}} {{ kapl_password }}"
    no_log: true
  - name: Plugins - Compile
    shell: "{{ item }} >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-plugins-compile.log"
    args:
      chdir: "{{ plugins_install_dir }}"
    loop:
      - ./tools/setup
      - ./configure
      - make
  - name: Plugins - Install
    shell: "make install >> {{ project_log }}/nagios/nagios-{{ inventory_hostname }}-plugins-install.log"
    args:
      chdir: "{{ plugins_install_dir }}"
  # check_ping is used as a sentinel for a successful plugins install.
  - name: Plugins - Verify if installed
    stat:
      path: /usr/local/nagios/libexec/check_ping
    register: plugins_installed
  - name: Debug Plugins - Verify if installed
    debug:
      msg: "Plugins are installed: {{ plugins_installed.stat.exists }}"
  - name: run and enable systemd service
    service:
      name: "{{ item }}"
      state: restarted
      enabled: true
    become: true
    loop:
      - httpd
      - nagios

config

---
# Configure an already-installed Nagios: cgi.cfg privileges, cfg_dir entries,
# and the per-node/group object files rendered from templates.
- name: Config Nagios Core and Plugins
  hosts: cluster_cn
  vars:
    config_dir: /usr/local/nagios/etc
  vars_files:
    - ../../vars/rhel.yml
  become: yes
  tasks:
  # k22cn002 mounts the shared tree under a different path.
  - name: Reset project root
    set_fact:
      project_root: /opt_shared-master/smci/kapl
    when: inventory_hostname == 'k22cn002'
  - name: Debug
    debug:
      msg: "{{ groups['cluster_cn'] | join(',') }} {{ project_root }}"
  - name: Check user privilege
    shell: "grep 'nagiosadmin,{{ kapl_username }}' {{ config_dir }}/cgi.cfg"
    register: check_user_priv
    changed_when: false
    # grep rc 1 (pattern absent) is the expected "needs configuring" case,
    # not a failure; without this the task aborted before the guard below.
    failed_when: check_user_priv.rc not in [0, 1]
  - name: Config users privilege
    replace:
      path: "{{ config_dir }}/cgi.cfg"
      regexp: 'nagiosadmin'
      replace: "nagiosadmin,{{ kapl_username }}"
      backup: no
    when: check_user_priv.rc == 1
  - name: Ensure switches configuration dir
    lineinfile:
      path: "{{ config_dir }}/nagios.cfg"
      regexp: "^cfg_dir={{ config_dir }}/switches"
      insertafter: "^#cfg_dir={{ config_dir }}/switches"
      line: "cfg_dir={{ config_dir }}/switches"
  - name: Ensure server configuration dir
    lineinfile:
      path: "{{ config_dir }}/nagios.cfg"
      regexp: "^cfg_dir={{ config_dir }}/servers"
      insertafter: "^#cfg_dir={{ config_dir }}/servers"
      line: "cfg_dir={{ config_dir }}/servers"
  - name: Ensure pdu configuration dir
    lineinfile:
      path: "{{ config_dir }}/nagios.cfg"
      # The regexp must match the managed line (pdus). The original matched
      # /routers, so the pdus line was re-inserted on every run.
      regexp: "^cfg_dir={{ config_dir }}/pdus"
      insertafter: "^#cfg_dir={{ config_dir }}/routers"
      line: "cfg_dir={{ config_dir }}/pdus"
  # Object-config directories referenced by the cfg_dir entries above.
  - name: Create directory for configuration
    file:
      path: "{{ item }}"
      state: directory
      owner: nagios
      group: nagios
      mode: '0755'
    loop:
      - "{{ config_dir }}/servers"
      - "{{ config_dir }}/switches"
      - "{{ config_dir }}/pdus"
  - name: Generate control node config
    template:
      src: ../../templates/nagios/control_node.cfg.j2
      dest: "{{ config_dir }}/servers/{{ inventory_hostname }}.cfg"
      owner: nagios
      group: nagios
  - name: Generate group config
    template:
      src: ../../templates/nagios/group_node_type.cfg.j2
      dest: "{{ config_dir }}/servers/group_node_type.cfg"
      owner: nagios
      group: nagios

control_node.cfg.j2

{# control_node.cfg.j2 — Nagios host + PING/SSH/HTTP service objects for the
   rendering host. The address comes from the inventory's ansible_host var.
   This Jinja comment is stripped at render time; output is unchanged. #}
###############################################################################
# LOCALHOST.CFG - SAMPLE OBJECT CONFIG FILE FOR MONITORING THIS MACHINE
#
#
# NOTE: This config file is intended to serve as an *extremely* simple
#       example of how you can create configuration entries to monitor
#       the local (Linux) machine.
#
###############################################################################



###############################################################################
#
# HOST DEFINITION
#
###############################################################################

# Define a host for the local machine

define host {

    use                     linux-server            ; Name of host template to use
                                                    ; This host definition will inherit all variables that are defined
                                                    ; in (or inherited by) the linux-server host template definition.
    host_name               {{ inventory_hostname }}
    alias                   {{ inventory_hostname }}
    address                 {{ hostvars[inventory_hostname]['ansible_host'] }}
}





###############################################################################
#
# SERVICE DEFINITIONS
#
###############################################################################

# Define a service to "ping" the local machine

define service {

    use                     generic-service           ; Name of service template to use
    host_name               {{ inventory_hostname }}
    service_description     PING
    check_command           check_ping!100.0,20%!500.0,60%
}



define service {

    use                     generic-service           ; Name of service template to use
    host_name               {{ inventory_hostname }}
    service_description     SSH
    check_command           check_ssh
    notifications_enabled   0
}

# Define a service to check HTTP on the local machine.
# Disable notifications for this service by default, as not all users may have HTTP enabled.

define service {

    use                     generic-service           ; Name of service template to use
    host_name               {{ inventory_hostname }}
    service_description     HTTP
    check_command           check_http
    notifications_enabled   0
}

group_node_type.cfg.j2

###############################################################################
#
# HOST GROUP DEFINITION
#
###############################################################################

{# One loop replaces four copy-pasted stanzas; adding a node type is now a
   one-line change. Rendered directives are identical to the original. #}
{% for hg in [
    {'name': 'cn', 'alias': 'Control Node', 'group': 'cluster_cn'},
    {'name': 'sn', 'alias': 'Small Node', 'group': 'cluster_sn'},
    {'name': 'ln', 'alias': 'Large Node', 'group': 'cluster_ln'},
    {'name': 'an', 'alias': 'Accelerated Node', 'group': 'cluster_an'},
] %}
define hostgroup {

    hostgroup_name          {{ hg.name }}           ; The name of the hostgroup
    alias                   {{ hg.alias }}           ; Long name of the group
    members                 {{ groups[hg.group] | join(',') }}               ; Comma separated list of hosts that belong to this group
}

{% endfor %}

## Install packages and verify network connectivity

---
# Initial control-node setup: pick the 10G interface, force MTU 9000 (runtime
# and persistent), then validate accounts, services, and reachability.
- name: Initial validation for control node
  hosts: cluster_cn
  vars_files:
    - ../vars/rhel.yml
  tasks:
  # The loop overwrites the fact each iteration; the last matching
  # enp*s0f0 interface wins (assumes one per host — TODO confirm).
  - name:  Fetch interface and set
    set_fact:
      network_10g_interface_name: "{{ item }}"
    loop: "{{ ansible_facts.interfaces | select('match', '^enp[0-9]+s0f0$')}}"
  # NOTE(review): ifconfig is legacy on RHEL 8; `ip link set ... mtu 9000`
  # or nmcli would be the supported path — confirm net-tools is present.
  - name: set interface MTU 9000
    shell: |
      ifconfig {{ item }} mtu 9000
    loop:
      - "{{ network_10g_interface_name }}"
  # Persist the MTU; assumes the ifcfg-<iface> file already exists.
  - name: set interface MTU 9000 permanent
    lineinfile:
      path: /etc/sysconfig/network-scripts/ifcfg-{{item}}
      line: MTU=9000
      state: present
    loop:
      - "{{ network_10g_interface_name }}"
    become: yes
  - name: Verify foo account and subscription
    include_role:
      name: rhel_account
  - name: Prevent Kernel Upgrades 
    ansible.builtin.lineinfile:
      path: /etc/yum.conf
      line: exclude=kernel* redhat-release* kmod-kvdo
    become: yes
  - name: Allow chronyd works
    ansible.builtin.lineinfile:
      path: /etc/chrony.conf
      line: allow 167.22.10.2
    become: yes
    when: inventory_hostname == primary_control_node
  - name: Set timezone
    community.general.timezone:
      name: America/Los_Angeles
  - name: stop and disable systemd service
    service:
      name: "{{ item }}"
      state: stopped
      enabled: no
    become: yes
    loop:
      - firewalld
    when: inventory_hostname == primary_control_node
  - name: run and enable systemd service
    service:
      name: "{{ item }}"
      state: started
      enabled: yes
    become: yes
    loop:
      - firewalld
    when: inventory_hostname == backup_control_node
  - name: run and enable systemd service
    service:
      name: "{{ item }}"
      state: started
      enabled: yes
    become: yes
    loop:
      - chronyd
      - nfs-server
  - name: check network reachability - internet and dns
    shell: ping -c 2 "{{ item }}"
    loop:
      - 8.8.8.8
      - www.google.com
  - name: check network reachability - 10gb network
    shell: ping -c 2 167.22.10.2 -I enp33s0f0
    become: yes
    register: ping_result
    failed_when: ping_result.rc != 0
    ignore_errors: yes
  - name: check network reachability - infiniband network
    shell: ping -c 2 167.122.10.2 -I ib0
    become: yes
    register: ping_result
    failed_when: ping_result.rc != 0
    ignore_errors: yes
  - name: check network reachability - BMC network
    shell: ping -c 2 167.222.10.2
    register: ping_result
    failed_when: ping_result.rc != 0
    ignore_errors: yes
  - name: check chronyd 
    shell: chronyc tracking
    register: chronyc_tracking
    failed_when: "'Leap status     : Normal' not in chronyc_tracking.stdout"
  - name: Install the package group
    ansible.builtin.yum:
      name: "{{ item }}"
      state: present
      disablerepo: '*'
      enablerepo: local-rhels8.7.0-x86_64--install-rhels8.7.0-x86_64-AppStream,local-rhels8.7.0-x86_64--install-rhels8.7.0-x86_64-BaseOS
    become: yes
    loop:
      - "@Infiniband Support"
      - "@Performance Tools"
      - "@Development Tools" 
      - "@Scientific Support"
      - "@System Tools"
      - python39
      - sshpass
    when: ansible_distribution=="RedHat"
  - include_role:
      name: ipmitool_package
  - name: Install ansible
    ansible.builtin.pip:
      name:
      - ansible-core==2.14
      - python-ipmi==0.5.4
      - ansible-pylibssh==1.1.0 # for cisco switch
      executable: pip3.9
  - name: Install ansible collections
    shell: ansible-galaxy collection install {{ item }}
    loop:
      - community.general
      - ansible.posix
      - ansible.utils
      - cisco.nxos # for cisco switch

Get a list of unreachable hosts in an ansible playbook

``` yml=
- name: Unreachable servers
  set_fact:
    down: "{{ ansible_play_hosts_all | difference(ansible_play_hosts) }}"
```

## Node Sync 

### sync from cn001 to cn002

Method 1: if executed on cn001
``` bash
ansible-playbook  playbook/playbook_sync.yml --limit 'cn002' --extra-vars "mode=push"

Method 2: if executed on cn002 with mode pull, use sudo

sudo ansible-playbook  playbook/playbook_sync.yml --limit 'cn001' --extra-vars "mode=pull"

sync from cn002 to cn001

Method 1: if executed on cn001 with mode pull, use sudo

sudo ansible-playbook  playbook/playbook_sync.yml --limit 'cn002' --extra-vars "mode=pull"

Method 2: if executed on cn002

ansible-playbook  playbook/playbook_sync.yml --limit 'cn001' --extra-vars "mode=push"

sync only development content like script, config from cn001 to cn002

if executed on cn002 with mode pull, use sudo

sudo ansible-playbook  playbook/playbook_sync.yml --tags develop --limit 'cn001' --extra-vars "mode=pull"

sync only ha content like data, reports, logs from cn001 to cn002

if executed on cn002 with mode pull, use sudo

sudo ansible-playbook  playbook/playbook_sync.yml --tags data --limit 'cn001' --extra-vars "mode=pull"

``` yml=
- name: Sync Process
  hosts: cluster_cn
  become: yes
  vars:
    devlopment_content:
      - bin
      - config
      - scripts
    data_content:
      - data
      - reports
      - logs
  tasks:
    - name: Show the sync mode
      debug:
        msg:
          - "{{ mode }} mode"
          - "push mode from machine which execute ansible to {{ inventory_hostname }}"
          - "pull mode from {{ inventory_hostname }} to machine which execute ansible"
    - name: Synchronization development content like script, config using rsync protocol
      ansible.posix.synchronize:
        mode: "{{ mode }}"
        src: "{{ project_root }}/{{ project_env }}/{{ item }}/"
        dest: "{{ project_root }}/{{ project_env }}/{{ item }}/"
      loop: "{{ devlopment_content }}"
      register: develop_sync_result
      tags:
        - develop
    - name: Show sync result
      debug:
        msg: "{{ item.stdout_lines }}"
      loop: "{{ develop_sync_result.results }}"
      tags:
        - develop
    - name: Synchronization ha content like data, reports, logs using rsync protocol
      ansible.posix.synchronize:
        mode: "{{ mode }}"
        src: "{{ project_root }}/{{ project_env }}/{{ item }}/"
        dest: "{{ project_root }}/{{ project_env }}/{{ item }}/"
        rsync_opts:
          - "--min-size=1"
      loop: "{{ data_content }}"
      register: develop_sync_result
      tags:
        - data
    - name: Show sync result
      debug:
        msg: "{{ item.stdout_lines }}"
      loop: "{{ develop_sync_result.results }}"
      tags:
        - data
```
## Template for loop from ansible_play_hosts and register with hostvars

``` yml=
---
# Read each host's cn_name.txt and render a per-play "BOM" file on the
# controller listing every play host with its decoded file content.
- name: Get cluster name
  hosts: all
  gather_facts: true
  vars:
    # Path on each remote host holding the control-node name.
    cluster_cn_name: /root/foo/cn_name.txt
    default_owner: foo
    default_group: foo
  tasks:
    # slurp returns the file base64-encoded; each host's result lands in
    # hostvars[host].control_host for use by the template below.
    - name: Read contents of file
      slurp:
        src: "{{ cluster_cn_name }}"
      register: control_host
    - name: Print contents of file
      debug:
        msg: "{{ control_host['content'] | b64decode }}"
    # Rendered once per host on the controller (delegate_to: localhost);
    # the template iterates ansible_play_hosts so the output is the same file.
    - name: Generate host bom txt
      template:
        src: ../templates/ha_cluster_cn_name_bom.txt.j2
        dest: "{{ project_root }}/{{ project_env }}/data/ha/cluster_cn_name_bom.txt"
        owner: "{{ default_owner }}"
        group: "{{ default_group }}"
      delegate_to: localhost

ha_cluster_cn_name_bom.txt.j2

{# One line per play host: "<hostname> <base64-decoded slurp content, trimmed>" #}
{% for play_hostname in ansible_play_hosts %}
{{ play_hostname }} {{ hostvars[play_hostname].control_host['content'] | b64decode | trim }}
{% endfor %}

Generate the known_hosts file on the control node

---
- name: Generate SSH keys and add to known_hosts
  hosts: all  # Replace with the target hosts you want to add to the known_hosts file
  gather_facts: false
  tasks:
    # ssh-keyscan only reads the remote host keys; changed_when: false stops
    # the task from reporting "changed" on every run (ansible-lint no-changed-when).
    - name: Add public key to known_hosts
      shell: ssh-keyscan {{ hostvars[inventory_hostname].ansible_host }}
      register: keyscan_output
      changed_when: false
      delegate_to: localhost
    # lineinfile adds each scanned key only if it is not already present,
    # keeping ~/.ssh/known_hosts free of duplicates.
    - name: Ensure unique entries in known_hosts
      ansible.builtin.lineinfile:
        path: ~/.ssh/known_hosts
        line: "{{ item }}"
        create: true
      loop: "{{ keyscan_output.stdout_lines }}"
      when: keyscan_output.stdout_lines is defined
      delegate_to: localhost

copy files with wildcard

get ip address from inventory group

{{ groups['mainnodes'] | map('extract', hostvars, ['ansible_host']) | join(',') }}

block and condition with group

# Role-based Slurm/munge installation: the outer block gates everything on
# RHEL/Rocky 8; inner tasks refine by inventory group membership.
- hosts: all
  tasks:
  - name: Install packages for Redhat
    block:
    - name: Installation for all nodes
      ansible.builtin.dnf:
        name:
          - munge
          - slurm
          - slurm-contribs
          - slurm-perlapi
        state: latest
    - name: Installation for login nodes
      ansible.builtin.dnf:
        name:
          - slurm-devel
          - slurm-pmi
          - slurm-pmi-devel
        state: latest
      when: "'loginNode' in group_names"
    - name: Installation for control nodes
      ansible.builtin.dnf:
        name:
          - slurm-slurmctld
        state: latest
      when: "'controlNode' in group_names"
    - name: Install the singularity ce
      # https://docs.sylabs.io/guides/latest/admin-guide/installation.html#install-from-provided-rpm-deb-packages
      # NOTE(review): this task is gated on databaseNode, same as the
      # mariadb/slurmdbd task below — confirm that is the intended group.
      ansible.builtin.dnf:
        name: 'https://github.com/sylabs/singularity/releases/download/v{{ options.singularity_ce_version }}/singularity-ce-{{ options.singularity_ce_version }}-1.el8.x86_64.rpm'
        state: present
        disable_gpg_check: True
      when: "'databaseNode' in group_names"
    - name: Installation for database nodes
      ansible.builtin.dnf:
        name:
          - mariadb-server
          - slurm-slurmdbd
        state: latest
      when: "'databaseNode' in group_names"
    - name: Installation for compute nodes
      ansible.builtin.dnf:
        name:
          - slurm-slurmd
          - slurm-pmi
        state: latest
      when: "'computeNode' in group_names or 'acceleratorNode' in group_names"
    when: (ansible_distribution == "Rocky" or ansible_distribution == "RedHat") and ansible_distribution_major_version == "8"

inventory for localhost

# Minimal inventory: run plays against the controller itself without SSH.
# Indentation normalized to consistent 2-space steps (was 5/6 spaces).
all:
  hosts:
    localhost:
      ansible_connection: local