Overview
Last time, we built a Docker container cluster by hand and installed Hadoop, ZooKeeper, HBase, and Spark on it one by one. Quite a lot to follow along with, right?
With Terraform and Ansible, you can simply copy my code and stand the whole thing up with far less effort!

1. Infrastructure Overview
The cluster consists of the following components (a sample Ansible inventory for these hosts follows the list):
- Master Node (master1): runs the Hadoop NameNode, HBase Master, and Spark Master
- Worker Nodes (worker1, worker2): run the Hadoop DataNodes, HBase RegionServers, and Spark Workers
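The Ansible playbooks later in this post address these nodes by group and host name (hosts: hadoop, hosts: master1, hosts: worker1,worker2). The post does not show the inventory file itself, so the sketch below is an assumption: a minimal inventory.ini that maps the host names to the static IPs defined in the Terraform code and groups all three nodes under hadoop.
# inventory.ini -- hypothetical; the file name, group layout, and variables are my assumptions
[hadoop]
master1 ansible_host=172.18.0.2
worker1 ansible_host=172.18.0.3
worker2 ansible_host=172.18.0.4

[all:vars]
ansible_user=root
ansible_ssh_private_key_file=~/.ssh/id_rsa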
2. Deploying the Infrastructure with Terraform
Terraform is used to create the cluster's containers and set up the network that connects them.
Dockerfile (terraform/Dockerfile)
FROM ubuntu:22.04
# Install required packages and set up the SSH server
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get update
RUN apt-get upgrade -y && \
apt-get install -y openssh-server sudo && \
mkdir /var/run/sshd && \
echo "root:password" | chpasswd
# SSH configuration (allow password authentication)
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
# Expose the SSH port
EXPOSE 22
Terraform code (terraform/main.tf)
terraform {
required_providers {
docker = {
source = "kreuzwerker/docker"
version = "~> 3.0"
}
}
}
provider "docker" {}
# Build the Docker image (from the Dockerfile above)
resource "docker_image" "custom_ubuntu" {
name = "ubuntu:ssh" # 빌드 후 저장될 이미지 이름
build {
context = "." # 현재 디렉토리 사용
dockerfile = "Dockerfile"
}
}
# Create a shared network (subnet specified so static IPs can be assigned)
resource "docker_network" "hadoop_network" {
name = "hadoop_network"
driver = "bridge"
ipam_config {
subnet = "172.18.0.0/16"
}
}
# Create the Ubuntu 22.04 containers (with static IPs)
resource "docker_container" "master1" {
name = "master1"
image = docker_image.custom_ubuntu.image_id
hostname = "master1"
ports {
internal = 2181
external = 2181
protocol = "tcp"
}
ports {
internal = 7077
external = 7077
protocol = "tcp"
}
ports {
internal = 8081
external = 8081
protocol = "tcp"
}
ports {
internal = 8082
external = 8082
protocol = "tcp"
}
ports {
internal = 8083
external = 8083
protocol = "tcp"
}
ports {
internal = 16000
external = 16000
protocol = "tcp"
}
ports {
internal = 16010
external = 16010
protocol = "tcp"
}
ports {
internal = 16020
external = 16020
protocol = "tcp"
}
ports {
internal = 33465
external = 33465
protocol = "tcp"
}
ports {
internal = 4040
external = 14040 # mapped to a different external port on the host
protocol = "tcp"
}
command = ["/bin/bash", "-c", "/usr/sbin/sshd -D & sleep infinity"]
networks_advanced {
name = docker_network.hadoop_network.name
ipv4_address = "172.18.0.2"
}
}
resource "docker_container" "worker1" {
name = "worker1"
image = docker_image.custom_ubuntu.image_id
command = ["/bin/bash", "-c", "/usr/sbin/sshd -D & sleep infinity"]
hostname = "worker1"
networks_advanced {
name = docker_network.hadoop_network.name
ipv4_address = "172.18.0.3"
}
}
resource "docker_container" "worker2" {
name = "worker2"
image = docker_image.custom_ubuntu.image_id
command = ["/bin/bash", "-c", "/usr/sbin/sshd -D & sleep infinity"]
hostname = "worker2"
networks_advanced {
name = docker_network.hadoop_network.name
ipv4_address = "172.18.0.4"
}
}
Running the commands below builds the image and creates the Docker containers defined above:
terraform init
terraform apply
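After terraform apply finishes, you can confirm that the three containers are up and received the expected static IPs. These are plain Docker CLI checks added here for convenience, not part of the original post:
# Check that the three containers are running and the ports are published
docker ps
# Confirm the static IPs assigned on the hadoop_network bridge
docker network inspect hadoop_network | grep -E '"Name"|"IPv4Address"'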
3. Generating and Distributing the SSH Key
Generate an SSH key and distribute it to the cluster nodes so that Ansible can connect to them over passwordless SSH.
scripts/generate_ssh_key.sh
#!/bin/bash
# Path to the SSH key
SSH_KEY_PATH="$HOME/.ssh/id_rsa"
# Check if an existing key exists and remove it
if [ -f "$SSH_KEY_PATH" ]; then
echo "[INFO] Existing SSH key found: $SSH_KEY_PATH"
echo "[INFO] Overwriting the existing key."
rm -f "$SSH_KEY_PATH" "$SSH_KEY_PATH.pub"
fi
# Generate a new SSH key (without passphrase)
ssh-keygen -t rsa -b 4096 -f "$SSH_KEY_PATH" -N ""
# Set correct permissions for the key files
chmod 600 "$SSH_KEY_PATH"
# Verify key creation
if [ -f "$SSH_KEY_PATH" ] && [ -f "$SSH_KEY_PATH.pub" ]; then
echo "[SUCCESS] SSH key has been successfully generated!"
echo "[INFO] Public key: $SSH_KEY_PATH.pub"
echo "[INFO] Private key: $SSH_KEY_PATH"
else
echo "[ERROR] SSH key generation failed!"
exit 1
fi
# List of target hosts
HOSTS=("172.18.0.2" "172.18.0.3" "172.18.0.4")
# SSH username
USER="root"
# Iterate over each host and copy the SSH key
for HOST in "${HOSTS[@]}"; do
echo "[INFO] Copying SSH key to $USER@$HOST"
ssh-copy-id -o StrictHostKeyChecking=no "$USER@$HOST"
if [ $? -eq 0 ]; then
echo "[SUCCESS] SSH key copied to $USER@$HOST"
else
echo "[ERROR] Failed to copy SSH key to $USER@$HOST"
fi
done
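The script is run once on the host machine that will run Ansible. Note that ssh-copy-id prompts for each container's root password, which the Dockerfile above set to "password"; if sshpass is installed you can skip the prompts. The exact invocation below is a suggestion rather than part of the original post:
chmod +x scripts/generate_ssh_key.sh
./scripts/generate_ssh_key.sh   # answer the password prompt ("password") for each host
# Optional, if sshpass is available: copy the key without interactive prompts
# for HOST in 172.18.0.2 172.18.0.3 172.18.0.4; do sshpass -p password ssh-copy-id -o StrictHostKeyChecking=no root@$HOST; done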
4. Installing Hadoop, HBase, and Spark with Ansible
4.1 Installing the Packages Required for the Hadoop Environment
setup_containers.yml
- name: Set up Hadoop containers
hosts: hadoop
become: yes
tasks:
# - name: Update package lists
# apt:
# update_cache: yes
- name: Upgrade packages
apt:
upgrade: yes
force_apt_get: yes
- name: Install required packages
apt:
name:
- curl
- rsync
- wget
- vim
- iputils-ping
- htop
- openjdk-11-jdk
- net-tools
- libxml2-dev
- libxslt1-dev
- python3-dev
- gcc
state: present
update_cache: yes
- name: Install Python
apt:
name: python3-pip
state: present
- name: Install packaging library
pip:
name: packaging
state: present
- name: Install lxml using pip
pip:
name: lxml
state: present
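The playbook targets the hadoop group, so it needs an inventory (for example the hypothetical inventory.ini sketched in section 1). Each setup playbook in this section is run the same way:
ansible-playbook -i inventory.ini setup_containers.yml
# ...and likewise for setup_hadoop.yml, setup_zookeeper.yml, setup_hbase.yml, and setup_spark.yml below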
4.2 Installing Hadoop
setup_hadoop.yml
- name: Set up Hadoop in containers
hosts: hadoop
become: yes
tasks:
- name: Update package lists
apt:
update_cache: yes
- name: Upgrade packages
apt:
upgrade: yes
force_apt_get: yes
- name: Download Hadoop
get_url:
url: http://apache.mirror.cdnetworks.com/hadoop/common/hadoop-3.4.1/hadoop-3.4.1.tar.gz
dest: /root/hadoop-3.4.1.tar.gz
- name: Extract Hadoop
ansible.builtin.unarchive:
src: /root/hadoop-3.4.1.tar.gz
dest: /usr/local/
remote_src: yes
# Rename /usr/local/hadoop-3.4.1 to /usr/local/hadoop
- name: Rename Hadoop directory
command: mv /usr/local/hadoop-3.4.1 /usr/local/hadoop
args:
creates: /usr/local/hadoop
- name: Remove Hadoop archive
file:
path: /root/hadoop-3.4.1.tar.gz
state: absent
- name: Add environment variables to .bashrc
lineinfile:
path: /root/.bashrc
line: "{{ item }}"
state: present
loop:
- 'export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64'
- 'export HADOOP_HOME=/usr/local/hadoop'
- 'export PATH=$PATH:$HADOOP_HOME/bin'
- 'export PATH=$PATH:$HADOOP_HOME/sbin'
- 'export PATH=$PATH:$JAVA_HOME/bin'
- 'export HADOOP_MAPRED_HOME=$HADOOP_HOME'
- 'export HADOOP_COMMON_HOME=$HADOOP_HOME'
- 'export HADOOP_HDFS_HOME=$HADOOP_HOME'
- 'export YARN_HOME=$HADOOP_HOME'
- 'export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native'
- 'export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"'
- name: Source .bashrc
shell: "source /root/.bashrc"
args:
executable: /bin/bash
# core-site.xml
- name: Add fs.defaultFS property to core-site.xml
xml:
pretty_print: yes
path: /usr/local/hadoop/etc/hadoop/core-site.xml
xpath: /configuration
add_children:
- property:
name: fs.defaultFS
value: hdfs://master1:8020
# yarn-site.xml
- name: Add YARN properties to yarn-site.xml
xml:
pretty_print: yes
path: /usr/local/hadoop/etc/hadoop/yarn-site.xml
xpath: /configuration
add_children:
- property:
name: yarn.nodemanager.aux-services
value: mapreduce_shuffle
- property:
name: yarn.nodemanager.aux-services.mapreduce.shuffle.class
value: org.apache.hadoop.mapred.ShuffleHandler
# mapred-site.xml
- name: Add MapReduce properties to mapred-site.xml
xml:
pretty_print: yes
path: /usr/local/hadoop/etc/hadoop/mapred-site.xml
xpath: /configuration
add_children:
- property:
name: mapreduce.framework.name
value: yarn
- property:
name: yarn.app.mapreduce.am.env
value: HADOOP_MAPRED_HOME=$HADOOP_HOME
- property:
name: mapreduce.map.env
value: HADOOP_MAPRED_HOME=$HADOOP_HOME
- property:
name: mapreduce.reduce.env
value: HADOOP_MAPRED_HOME=$HADOOP_HOME
# Ensure the masters file exists
- name: Create masters file if not present
copy:
content: ""
dest: /usr/local/hadoop/etc/hadoop/masters
mode: '0644'
# Ensure 'master1' is present in masters file
- name: Add master1 to masters file
lineinfile:
path: /usr/local/hadoop/etc/hadoop/masters
line: "master1"
state: present
# 10. Ensure 'master1', 'worker1', and 'worker2' are present in workers file
- name: Manage workers file
block:
- lineinfile:
path: /usr/local/hadoop/etc/hadoop/workers
regexp: '^localhost$'
state: absent
- lineinfile:
path: /usr/local/hadoop/etc/hadoop/workers
line: "{{ item }}"
state: present
loop:
- master1
- worker1
- worker2
- name: Setup SSH Key-based Authentication
hosts: all
become: yes
tasks:
- name: Ensure .ssh directory exists
file:
path: ~/.ssh
state: directory
mode: '0700'
- name: Generate SSH key if not exists
command: ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N ""
args:
creates: ~/.ssh/id_rsa
- name: Ensure public key file exists
stat:
path: ~/.ssh/id_rsa.pub
register: pub_key_file
- name: Fetch public key from each host
shell: cat ~/.ssh/id_rsa.pub
register: ssh_pub_key
changed_when: false
when: pub_key_file.stat.exists
- name: Store SSH public key in hostvars
set_fact:
my_ssh_key: "{{ ssh_pub_key.stdout }}"
- name: Aggregate all SSH keys on localhost
hosts: localhost
gather_facts: no
tasks:
- name: Fetch public key from localhost
shell: cat ~/.ssh/id_rsa.pub
register: local_ssh_key
changed_when: false
- name: Collect all SSH keys from hosts
set_fact:
global_ssh_keys: "{{ groups['all'] | map('extract', hostvars, 'my_ssh_key') | select('defined') | list }}"
- name: Add localhost SSH key to global_ssh_keys
set_fact:
global_ssh_keys: "{{ global_ssh_keys + [local_ssh_key.stdout] }}"
- name: Debug aggregated SSH keys
debug:
var: global_ssh_keys
- name: Convert list to string
set_fact:
ssh_keys: "{{ global_ssh_keys | join('\n') }}"
- name: Deploy SSH Keys to All Hosts
hosts: all
become: yes
tasks:
- name: Ensure authorized_keys file has correct permissions
file:
path: ~/.ssh/authorized_keys
state: touch
mode: '0600'
- name: Ensure authorized_keys contains all SSH keys
copy:
content: "{{ hostvars['localhost']['ssh_keys'] }}"
dest: ~/.ssh/authorized_keys
mode: '0600'
- name: Configure Hadoop on master1
hosts: master1
become: yes
tasks:
- name: Create Hadoop directories
shell: |
cd /usr/local/hadoop
mkdir -p hadoop_tmp/hdfs/namenode
mkdir -p hadoop_tmp/hdfs/datanode
chmod 777 hadoop_tmp/
- name: Update hadoop-env.sh
lineinfile:
path: /usr/local/hadoop/etc/hadoop/hadoop-env.sh
line: "{{ item }}"
insertafter: EOF
loop:
- 'export JAVA_HOME=''/usr/lib/jvm/java-11-openjdk-amd64'''
- 'export HDFS_NAMENODE_USER=root'
- 'export HDFS_DATANODE_USER=root'
- 'export HDFS_SECONDARYNAMENODE_USER=root'
- 'export YARN_NODEMANAGER_USER=root'
- 'export YARN_RESOURCEMANAGER_USER=root'
- name: Add HDFS properties to hdfs-site.xml
xml:
pretty_print: yes
path: /usr/local/hadoop/etc/hadoop/hdfs-site.xml
xpath: /configuration
add_children:
- property:
name: dfs.replication
value: "3"
- property:
name: dfs.permissions
value: "false"
- property:
name: dfs.namenode.name.dir
value: /usr/local/hadoop/hadoop_tmp/hdfs/namenode
- property:
name: dfs.datanode.data.dir
value: /usr/local/hadoop/hadoop_tmp/hdfs/datanode
- name: Configure Hadoop on worker nodes
hosts: worker1,worker2
become: yes
tasks:
# 1. Create Hadoop directories
- name: Create Hadoop directories
shell: |
cd /usr/local/hadoop
mkdir -p hadoop_tmp/hdfs/datanode
chmod 777 hadoop_tmp/
args:
creates: /usr/local/hadoop/hadoop_tmp/hdfs/datanode
# 2. Update hadoop-env.sh
- name: Update hadoop-env.sh
lineinfile:
path: /usr/local/hadoop/etc/hadoop/hadoop-env.sh
line: 'export JAVA_HOME=''/usr/lib/jvm/java-11-openjdk-amd64'''
insertafter: EOF
# 3. Ensure <configuration> element exists in hdfs-site.xml
- name: Ensure <configuration> element exists in hdfs-site.xml
xml:
path: /usr/local/hadoop/etc/hadoop/hdfs-site.xml
xpath: /configuration
state: present
# 4. Add HDFS properties using the XML module
- name: Add HDFS properties to hdfs-site.xml
xml:
pretty_print: yes
path: /usr/local/hadoop/etc/hadoop/hdfs-site.xml
xpath: /configuration
add_children:
- property:
name: dfs.replication
value: "2"
- property:
name: dfs.permissions
value: "false"
- property:
name: dfs.datanode.data.dir
value: "/usr/local/hadoop/hadoop_tmp/hdfs/datanode"
4.3 Installing and Configuring ZooKeeper
setup_zookeeper.yml
- name: Install and Configure ZooKeeper
hosts: all
become: yes
tasks:
- name: Download ZooKeeper
get_url:
url: https://dlcdn.apache.org/zookeeper/zookeeper-3.9.3/apache-zookeeper-3.9.3-bin.tar.gz
dest: /root
mode: '0644'
- name: Extract Zookeeper
ansible.builtin.unarchive:
src: /root/apache-zookeeper-3.9.3-bin.tar.gz
dest: /usr/local/
remote_src: yes
- name: Move ZooKeeper to correct directory
command: mv /usr/local/apache-zookeeper-3.9.3-bin /usr/local/zookeeper
- name: Change ownership of ZooKeeper directory
file:
path: /usr/local/zookeeper
owner: "root"
group: "root"
recurse: yes
- name: Remove Zookeeper archive
file:
path: /root/apache-zookeeper-3.9.3-bin.tar.gz
state: absent
- name: Create an empty zoo.cfg (populated by the tasks below)
copy:
content: ""
dest: /usr/local/zookeeper/conf/zoo.cfg
mode: '0644'
- name: Add environment variables to .bashrc
lineinfile:
path: /root/.bashrc
line: "{{ item }}"
state: present
loop:
- 'export ZOOKEEPER_HOME=/usr/local/zookeeper'
- 'export PATH=$PATH:$ZOOKEEPER_HOME/bin'
- name: Configure zoo.cfg
block:
- name: Set dataDir in zoo.cfg
lineinfile:
path: /usr/local/zookeeper/conf/zoo.cfg
line: "dataDir=/usr/local/zookeeper/data"
state: present
- name: Set clientPort in zoo.cfg
lineinfile:
path: /usr/local/zookeeper/conf/zoo.cfg
line: "clientPort=2181"
state: present
- name: Set tickTime in zoo.cfg
lineinfile:
path: /usr/local/zookeeper/conf/zoo.cfg
line: "tickTime=2000"
state: present
- name: Set initLimit in zoo.cfg
lineinfile:
line: "initLimit=10"
path: /usr/local/zookeeper/conf/zoo.cfg
state: present
- name: Set syncLimit in zoo.cfg
lineinfile:
path: /usr/local/zookeeper/conf/zoo.cfg
line: "syncLimit=5"
state: present
- name: Set server configurations in zoo.cfg
lineinfile:
path: /usr/local/zookeeper/conf/zoo.cfg
line: "{{ item }}"
state: present
loop:
- "server.1=master1:2888:3888"
- "server.2=worker1:2888:3888"
- "server.3=worker2:2888:3888"
- name: Create data directory for ZooKeeper
file:
path: /usr/local/zookeeper/data
state: directory
mode: '0755'
- name: Ensure data directory exists
file:
path: /usr/local/zookeeper/data
state: directory
mode: '0755'
- name: Set myid for each node
block:
- name: Set myid for master1
copy:
content: "1"
dest: /usr/local/zookeeper/data/myid
mode: '0644'
when: inventory_hostname == "master1"
- name: Set myid for worker1
copy:
content: "2"
dest: /usr/local/zookeeper/data/myid
mode: '0644'
when: inventory_hostname == "worker1"
- name: Set myid for worker2
copy:
content: "3"
dest: /usr/local/zookeeper/data/myid
mode: '0644'
when: inventory_hostname == "worker2"
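Each node must end up with a distinct myid (1-3) matching the server.N entries in zoo.cfg. A quick ad-hoc check (my addition, using the assumed inventory.ini):
ansible all -i inventory.ini -m command -a "cat /usr/local/zookeeper/data/myid"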
4.4 Installing HBase
setup_hbase.yml
- name: Set up HBase in containers
hosts: all
become: yes
tasks:
- name: Update package lists
apt:
update_cache: yes
- name: Upgrade packages
apt:
upgrade: yes
force_apt_get: yes
- name: Download HBase
get_url:
url: http://mirror.apache-kr.org/hbase/stable/hbase-2.5.10-bin.tar.gz
dest: /root/hbase-2.5.10-bin.tar.gz
- name: Extract HBase
ansible.builtin.unarchive:
src: /root/hbase-2.5.10-bin.tar.gz
dest: /usr/local/
remote_src: yes
# Rename /usr/local/hbase-2.5.10 to /usr/local/hbase
- name: Rename HBase directory
command: mv /usr/local/hbase-2.5.10 /usr/local/hbase
args:
creates: /usr/local/hbase
- name: Remove HBase archive
file:
path: /root/hbase-2.5.10-bin.tar.gz
state: absent
- name: Configure regionservers
block:
- name: Create regionservers file
copy:
content: ""
dest: /usr/local/hbase/conf/regionservers
mode: '0644'
- name: Set regionservers
lineinfile:
path: /usr/local/hbase/conf/regionservers
line: "{{ item }}"
state: present
loop:
- "worker1"
- "worker2"
- name: Add environment variables to .bashrc
lineinfile:
path: /root/.bashrc
line: "{{ item }}"
state: present
loop:
- 'export HBASE_HOME=/usr/local/hbase'
- 'export HBASE_CONF_DIR=/usr/local/hbase/conf'
- 'export PATH=$PATH:/usr/local/hbase/bin'
- 'export HBASE_LOG_DIR=$HADOOP_LOG_DIR'
- name: Source .bashrc
shell: "source /root/.bashrc"
args:
executable: /bin/bash
- name: Remove hbase-site.xml if it exists
file:
path: /usr/local/hbase/conf/hbase-site.xml
state: absent
- name: Create hbase-site.xml with a basic structure
copy:
dest: /usr/local/hbase/conf/hbase-site.xml
content: |
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
</configuration>
mode: '0644'
force: yes # overwrite the file if it already exists
- name: Create and add HBase properties to hbase-site.xml
xml:
pretty_print: yes
path: /usr/local/hbase/conf/hbase-site.xml
state: present
xpath: /configuration
add_children:
- property:
name: hbase.master.hostname
value: master1
- property:
name: hbase.master
value: master1:16000
- property:
name: hbase.wal.provider
value: filesystem
- property:
name: hbase.rest.port
value: "8081"
- property:
name: hbase.regionserver.port
value: "16020"
- property:
name: hbase.rootdir
value: hdfs://master1:8020/hbase
- property:
name: hbase.cluster.distributed
value: "true"
- property:
name: hbase.zookeeper.property.clientPort
value: "2181"
- property:
name: hbase.zookeeper.quorum
value: master1,worker1,worker2
- property:
name: hbase.zookeeper.property.dataDir
value: /usr/local/zookeeper
- property:
name: hbase.rpc.controllerfactory.class
value: org.apache.hadoop.hbase.ipc.RpcControllerFactory
- name: Configure hbase-env.sh
become: yes
blockinfile:
path: /usr/local/hbase/conf/hbase-env.sh
block: |
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
export HBASE_MANAGES_ZK=false
export HBASE_REGIONSERVERS=/usr/local/hbase/conf/regionservers
export HBASE_OPTS="$HBASE_OPTS -Dfile.encoding=UTF-8"
marker: "# {mark} HBase environment settings"
state: present
- name: Configure backup-masters
copy:
content: "worker1"
dest: /usr/local/hbase/conf/backup-masters
mode: '0644'
4.5 Installing Spark
setup_spark.yml
- name: Install and Configure Spark
hosts: all
become: yes
vars:
HBASE_HOME: /usr/local/hbase
JAVA_HOME: /usr/lib/jvm/java-11-openjdk-amd64
HADOOP_HOME: /usr/local/hadoop
tasks:
- name: Download Spark
get_url:
url: https://dlcdn.apache.org/spark/spark-3.4.4/spark-3.4.4-bin-hadoop3.tgz
dest: /root
mode: '0644'
- name: Extract Spark
ansible.builtin.unarchive:
src: /root/spark-3.4.4-bin-hadoop3.tgz
dest: /usr/local/
remote_src: yes
- name: Move Spark to correct directory
command: mv /usr/local/spark-3.4.4-bin-hadoop3 /usr/local/spark
- name: Remove downloaded Spark archive
file:
path: /root/spark-3.4.4-bin-hadoop3.tgz
state: absent
- name: Change ownership of Spark directory
file:
path: /usr/local/spark
owner: "root"
group: "root"
recurse: yes
- name: Add environment variables to .bashrc
lineinfile:
path: /root/.bashrc
line: "{{ item }}"
state: present
loop:
- 'export SPARK_HOME=/usr/local/spark'
- 'export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin'
- name: Ensure /usr/local/spark/lib directory exists
file:
path: /usr/local/spark/lib
state: directory
mode: '0755'
- name: Copy JAR files from local to remote
copy:
src: "{{ item }}"
dest: /usr/local/spark/lib/
mode: '0644'
with_fileglob:
- "../lib/*.jar"
- name: Copy JAR files from local to remote
copy:
src: "{{ item }}"
dest: /usr/local/hbase/lib/
mode: '0644'
with_fileglob:
- "../lib/*.jar"
- name: Copy spark-env.sh template
command: cp /usr/local/spark/conf/spark-env.sh.template /usr/local/spark/conf/spark-env.sh
- name: Configure spark-env.sh
block:
- name: Set Spark environment variables
lineinfile:
path: /usr/local/spark/conf/spark-env.sh
line: "{{ item }}"
state: present
with_items:
- "export SPARK_MASTER_HOST=master1"
- "export SPARK_MASTER_PORT=7077"
- "export SPARK_MASTER_WEBUI_PORT=8082"
- "export SPARK_WORKER_WEBUI_PORT=8083"
- "export SPARK_WORKER_CORES=1"
- "export SPARK_WORKER_MEMORY=2g"
- "export JAVA_HOME={{ JAVA_HOME }}"
- "export HADOOP_HOME={{ HADOOP_HOME }}"
- "export YARN_CONF_DIR={{ HADOOP_HOME }}/etc/hadoop"
- "export HADOOP_CONF_DIR={{ HADOOP_HOME }}/etc/hadoop"
- "export SPARK_DIST_CLASSPATH=/usr/local/hadoop/etc/hadoop:/usr/local/hadoop/share/hadoop/common/lib/*:/usr/local/hadoop/share/hadoop/common/*:/usr/local/hadoop/share/hadoop/hdfs:/usr/local/hadoop/share/hadoop/hdfs/lib/*:/usr/local/hadoop/share/hadoop/hdfs/*:/usr/local/hadoop/share/hadoop/mapreduce/*:/usr/local/hadoop/share/hadoop/yarn:/usr/local/hadoop/share/hadoop/yarn/lib/*:/usr/local/hadoop/share/hadoop/yarn/*:{{ HBASE_HOME }}/lib/*"
- name: Copy spark-defaults.conf template
command: cp /usr/local/spark/conf/spark-defaults.conf.template /usr/local/spark/conf/spark-defaults.conf
- name: Configure spark-defaults.conf
block:
- name: Set Spark default packages
lineinfile:
path: /usr/local/spark/conf/spark-defaults.conf
line: "{{ item }}"
state: present
with_items:
- "spark.jars /usr/local/spark/lib/hbase-spark-1.1.0-SNAPSHOT.jar,/usr/local/spark/lib/hbase-spark-protocol-shaded-1.1.0-SNAPSHOT.jar"
- name: Create workers file if it does not exist
command: touch /usr/local/spark/conf/workers
- name: Add workers in workers file
blockinfile:
path: /usr/local/spark/conf/workers
block: |
worker1
worker2
state: present
- name: Configure hbase-env.sh
lineinfile:
path: "{{ HBASE_HOME }}/conf/hbase-env.sh"
line: "export HBASE_CLASSPATH={{ HBASE_HOME }}/lib/scala-library-2.12.17.jar:{{ HBASE_HOME }}/lib/hbase-spark-1.1.0-SNAPSHOT.jar:{{ HBASE_HOME }}/lib/hbase-spark-protocol-shaded-1.1.0-SNAPSHOT.jar"
state: present
insertafter: EOF
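The two "Copy JAR files" tasks, the spark.jars line, and the HBASE_CLASSPATH entry all assume that the HBase-Spark connector jars already exist in a local lib/ directory next to the playbooks; the post does not show how they were obtained, and the relative path below is my assumption. A quick check before running the playbook:
# Expect hbase-spark-1.1.0-SNAPSHOT.jar, hbase-spark-protocol-shaded-1.1.0-SNAPSHOT.jar,
# and scala-library-2.12.17.jar here (path relative to the playbook directory -- assumption)
ls ../lib/*.jar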
5. Starting Hadoop, ZooKeeper, HBase, and Spark
5.1 Starting Hadoop
start_hadoop.yml
- name: Format HDFS NameNode and start Hadoop services
hosts: master1
become: yes
tasks:
- name: Format HDFS NameNode
environment:
PATH: "{{ ansible_env.PATH }}:/usr/local/hadoop/bin"
shell: hdfs namenode -format
register: format_output
- name: Display format output
debug:
var: format_output.stdout
- name: Start Hadoop services
shell: /usr/local/hadoop/sbin/start-all.sh
register: start_output
- name: Display start output
debug:
var: start_output.stdout
- name: Check JPS
shell: jps
register: jps_output
- name: Display JPS output
debug:
var: jps_output.stdout
ansible-playbook start_hadoop.yml
5.2 Starting ZooKeeper
start_zookeeper.yml
- name: Start Zookeepers in all nodes
hosts: master1,worker1,worker2
become: yes
tasks:
- name: Start Zookeepers
environment:
PATH: "{{ ansible_env.PATH }}:/usr/local/zookeeper/bin"
shell: zkServer.sh start
register: zk_start_output
- name: Display zk_start output
debug:
var: zk_start_output.stdout
- name: Show Zookeepers status
shell: /usr/local/zookeeper/bin/zkServer.sh status
register: zk_status_output
- name: Display status output
debug:
var: zk_status_output.stdout
- name: Check JPS
shell: jps
register: jps_output
- name: Display JPS output
debug:
var: jps_output.stdout
ansible-playbook start_zookeeper.yml
5.3 Starting HBase
docker exec -it master1 sudo /usr/local/hbase/bin/start-hbase.sh
docker exec -it master1 sudo /usr/local/hbase/bin/hbase-daemon.sh start rest
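To confirm that the HMaster and the RegionServers registered correctly, you can open the HBase shell on master1 and run status (a standard HBase shell command; this check is my addition, not part of the original post):
docker exec -it master1 /usr/local/hbase/bin/hbase shell
# then, inside the shell, run:
#   status
#   exit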
5.4 Starting Spark Connect
docker exec -it master1 /usr/local/spark/sbin/start-connect-server.sh --packages org.apache.spark:spark-connect_2.12:3.4.4
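Spark Connect's gRPC server listens on port 15002 by default, and that port is not among the ones published in main.tf, so clients outside the Docker network cannot reach it as-is. If you want to connect from the host, one option (my addition, not in the original Terraform code) is an extra ports block on the master1 container:
ports {
internal = 15002
external = 15002
protocol = "tcp"
}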
6. Tearing Down the Cluster (terraform destroy)
To remove the cluster, run the following script.
./destroy_all.sh
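The contents of destroy_all.sh are not shown in this post; a minimal sketch, assuming it only needs to tear down what Terraform created, might look like this:
#!/bin/bash
# Hypothetical destroy_all.sh: remove the containers, network, and image created by Terraform
cd terraform
terraform destroy -auto-approve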
This post covered provisioning the infrastructure with Terraform and using Ansible to install and start Hadoop, HBase, and Spark automatically. With these tools in place, an automated big data environment can be rebuilt with very little effort.
