
# Setup Hadoop-3.1.1 & Spark-2.4.0 cluster
# using Ubuntu 18.04

#1
sudo vi /etc/hosts
172.20.10.4 master
172.20.10.5 slave1
172.20.10.6 slave2

#2 sudo vi /etc/netplan/50-cloud-init.yaml
# changing to a static IP

network:
  version: 2
  ethernets:
    enp0s3:
      dhcp4: no
      addresses: [172.20.10.4/24]
      gateway4: 172.20.10.1
      nameservers:
        addresses: [8.8.8.8,8.8.4.4]

sudo netplan apply
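
# quick sanity check that the static address is active (interface name assumed
# to be enp0s3, matching the netplan file above):
ip addr show enp0s3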


#3 change name by editing /etc/hostname or hostnamectl
sudo hostnamectl set-hostname master
hostname
# make sure the Hadoop user (ziyati here) is in the sudo group
sudo usermod -aG sudo ziyati

#4 connect to master to install openssh-server


sudo apt-get remove --purge openssh-server
sudo apt-get install openssh-server
# if a dependency problem occurs while installing openssh-server, pin the client version with aptitude:
sudo apt-get install aptitude
sudo aptitude install openssh-client=required_version
sudo aptitude install openssh-client=1:7.6p1-4
# Generate an SSH key pair for passwordless login (needed by the Hadoop/Spark start scripts)
ssh-keygen -t rsa -P ""
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 0600 ~/.ssh/authorized_keys
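
# The key above only authorizes logins to the master itself. Once the slave VMs
# exist (step #9) the same key must be authorized there too; cloning the VM
# usually takes care of it, otherwise a sketch (hostnames from /etc/hosts):
ssh-copy-id ziyati@slave1
ssh-copy-id ziyati@slave2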

#5 Test secure connection


ssh ziyati@localhost
logout

#6 install java
sudo apt install openjdk-8-jdk
update-java-alternatives -l

# problems may occur during the install


# then clean up the package state and retry:
sudo rm /var/lib/dpkg/updates/000*
sudo apt-get clean
sudo apt-get update
sudo apt-get install ttf-mscorefonts-installer

#7 download hadoop
curl -O http://mirror.cogentco.com/pub/apache/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
tar -xzf hadoop-3.1.1.tar.gz
sudo mv hadoop-3.1.1 /usr/local/hadoop
mkdir -p /home/ziyati/hadoop_tmp/{data,name}
rm hadoop-3.1.1.tar.gz

#7.1 Set up hadoop environment variables.

echo export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64 >> ~/.bashrc
echo export PATH=\$JAVA_HOME/bin:\$PATH >> ~/.bashrc
echo export HADOOP_HOME=/usr/local/hadoop >> ~/.bashrc
echo export PATH=\$HADOOP_HOME/bin:\$HADOOP_HOME/sbin:\$PATH >> ~/.bashrc
echo export HADOOP_CONF_DIR=\$HADOOP_HOME/etc/hadoop >> ~/.bashrc

source ~/.bashrc
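
# quick check that the variables resolve (should print the Hadoop 3.1.1 banner):
hadoop version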

#7.2 Copy configuration files for master node.


cp master/* /usr/local/hadoop/etc/hadoop/
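
# The master/ folder above is assumed to hold pre-edited Hadoop config files
# that are not shown in this guide. A minimal sketch of what they typically
# contain for this cluster (hostnames and paths taken from the steps above):
cat > /usr/local/hadoop/etc/hadoop/core-site.xml <<'EOF'
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://master:9000</value>
  </property>
</configuration>
EOF

cat > /usr/local/hadoop/etc/hadoop/hdfs-site.xml <<'EOF'
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>2</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/home/ziyati/hadoop_tmp/name</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/home/ziyati/hadoop_tmp/data</value>
  </property>
</configuration>
EOF

# Hadoop 3.x reads the slave list from etc/hadoop/workers
cat > /usr/local/hadoop/etc/hadoop/workers <<'EOF'
slave1
slave2
EOF

# JAVA_HOME must also be set in hadoop-env.sh
echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64" >> /usr/local/hadoop/etc/hadoop/hadoop-env.sh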

#8 download spark
curl -O https://www-eu.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
tar -xzf spark-2.4.0-bin-hadoop2.7.tgz
sudo mv spark-2.4.0-bin-hadoop2.7 /usr/local/spark
rm spark-2.4.0-bin-hadoop2.7.tgz

#8.1 Set up spark environment variables.

echo export SPARK_HOME=/usr/local/spark >> ~/.bashrc
echo export PATH=\$SPARK_HOME/bin:\$PATH >> ~/.bashrc
echo export PATH=\$SPARK_HOME/sbin:\$PATH >> ~/.bashrc
source ~/.bashrc

#8.2 Set up spark files.


# (if conf/slaves does not exist yet, copy it from conf/slaves.template)
vi $SPARK_HOME/conf/slaves
172.20.10.5
172.20.10.6
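
# Optionally pin the master's bind address in spark-env.sh so the workers
# always reach it on this IP (a sketch; the template ships with Spark):
cp $SPARK_HOME/conf/spark-env.sh.template $SPARK_HOME/conf/spark-env.sh
echo "export SPARK_MASTER_HOST=172.20.10.4" >> $SPARK_HOME/conf/spark-env.sh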
#9 Clone VM to slave1 and slave2
# change IP and hostname

# after changing the IP, re-apply the network configuration:

sudo netplan apply

# Format HDFS (run once, on the master only)
hdfs namenode -format

#10 Start hadoop services


cd /usr/local/hadoop/sbin
./start-dfs.sh
./start-yarn.sh
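
# on the master, jps should now list NameNode, SecondaryNameNode and ResourceManager
jps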

#11 Connect to master to verify

http://172.20.10.4:9870
http://172.20.10.4:8088
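
# DataNode registration can also be checked from the shell:
hdfs dfsadmin -report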

#start spark
cd /usr/local/spark/sbin

./start-all.sh
http://172.20.10.4:8080
# Spark should be launched from the master node
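
# A quick end-to-end test of the standalone cluster with the bundled SparkPi
# example (jar name assumed from the spark-2.4.0-bin-hadoop2.7 layout):
spark-submit --master spark://172.20.10.4:7077 \
  --class org.apache.spark.examples.SparkPi \
  $SPARK_HOME/examples/jars/spark-examples_2.11-2.4.0.jar 10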

############
# Thanks !
############

# all services are up on the master


# check on slave1 (DataNode, NodeManager and the Spark Worker should be listed)
jps

# Complete the environment with Anaconda


# Installing Jupyter
curl -O https://repo.anaconda.com/archive/Anaconda3-2019.03-Linux-x86_64.sh
bash Anaconda3-2019.03-Linux-x86_64.sh

# create a virtual env called jupyter


conda create -n jupyter
# activate it
source activate jupyter
conda install notebook

# start jupyter bound to the master's IP


jupyter notebook --ip 172.20.10.4

# Installing findspark
# findspark is a Python library that lets you import and use PySpark like any
# other Python library.

pip install findspark

# Create your first Spark application


# Cell 1: locate the Spark installation via SPARK_HOME
import findspark
# Cell 2: add PySpark to sys.path
findspark.init()
# Cell 3
import pyspark
# Cell 4: connect to the standalone master started above
sc = pyspark.SparkContext(master='spark://172.20.10.4:7077', appName='myApp')
