Google Dataproc cluster init script by Yuri Zhylyuk: Apache Spark, Python 3 (Miniconda), and a set of pre-installed libraries for data processing.

Created by Yuri Zhylyuk (last-modified date not recorded).

Copy this shell script to the Dataproc init bucket: gsutil cp jupyter-spark.sh gs://dataproc-inits/

start cluster: gcloud dataproc clusters create jupyter-1 --zone asia-east1-b --master-machine-type n1-standard-2 --master-boot-disk-size 100 --num-workers 3 --worker-machine-type n1-standard-4 --worker-boot-disk-size 50 --project spark-recommendation-engine --initialization-actions gs://dataproc-inits/jupyter-spark.sh --scopes 'https://www.googleapis.com/auth/cloud-platform' --properties spark:spark.executorEnv.PYTHONHASHSEED=0

change number of workers: gcloud dataproc clusters update jupyter-1 --num-workers 3

initiate ssh channel: gcloud compute ssh --zone=asia-east1-b --ssh-flag="-D 1080" --ssh-flag="-N" --ssh-flag="-n" jupyter-1-m

start jupyter session: chromium-browser --proxy-server="socks5://localhost:1080" --host-resolver-rules="MAP * 0.0.0.0, EXCLUDE localhost" --user-data-dir=/tmp/

#!/bin/bash
# Install Miniconda (Python 3) and common data-processing libraries, then
# put conda's Python first on PATH for this session and all future logins.
# NOTE(review): Dataproc init actions run as root, so sudo is redundant
# here but harmless; kept for clarity.

sudo apt-get -y install bzip2 curl && \
curl -sSL -o /tmp/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash /tmp/miniconda.sh -b -p /miniconda3 && \
rm -f /tmp/miniconda.sh && \
export PATH=/miniconda3/bin:$PATH && \
echo 'PATH=/miniconda3/bin:$PATH' | sudo tee -a /etc/environment && \
export PYTHON_PATH=/miniconda3/bin && \
echo 'PYTHON_PATH=/miniconda3/bin' | sudo tee -a /etc/environment && \
conda update -y conda && \
pip install --upgrade pip && \
conda install -y numpy pandas SQLAlchemy requests lxml dask && \
sudo apt-get clean autoclean && \
sudo apt-get autoremove -y

# Fixes vs. original:
#  - removed the trailing space after a continuation '\' that silently
#    broke the && chain;
#  - removed a comment line embedded mid-chain (its '&& \' was part of
#    the comment, leaving a dangling '&&');
#  - installer now gets '-b -p /miniconda3' so the install prefix matches
#    the /miniconda3/bin PATH entries (plain '-b' installs to $HOME);
#  - 'sudo echo ... >> file' replaced with '| sudo tee -a' so the write
#    itself is privileged, not just the echo.

# Point Spark at the conda interpreter and pin PYTHONHASHSEED so that
# hash-based partitioning is consistent across Python 3 executors
# (Python 3 randomizes string hashes per process by default).
export PYSPARK_PYTHON=/miniconda3/bin/python
export PYSPARK_DRIVER_PYTHON=/miniconda3/bin/python
export PYTHONHASHSEED=0

# Persist the settings for login shells and for Spark's own environment.
# PYSPARK_PYTHON must be the interpreter binary, not the bin directory
# (the original wrote '/miniconda3/bin' here, contradicting the export
# above and breaking executor startup).
echo "export PYSPARK_PYTHON=/miniconda3/bin/python" | sudo tee -a /etc/profile.d/spark_config.sh /etc/*bashrc
echo "export PYTHONHASHSEED=0" | sudo tee -a /etc/profile.d/spark_config.sh /etc/*bashrc /usr/lib/spark/conf/spark-env.sh
# tee -a instead of 'sudo echo >>': the redirection must run privileged.
echo "spark.executorEnv.PYTHONHASHSEED=0" | sudo tee -a /etc/spark/conf/spark-defaults.conf


# Only run on the master node
ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role)
if [[ "${ROLE}" == 'Master' ]]; then

	# Install IPython Notebook and create a default profile.
	# Use an absolute path so the gcsfuse mount below (which targets
	# /IPythonNB) refers to the same directory the notebook runs in;
	# the original created a relative ./IPythonNB and then mounted a
	# non-existent /IPythonNB.
	mkdir -p /IPythonNB
	cd /IPythonNB
	pip install "ipython[notebook]"
	ipython profile create default

	# Notebook server config: listen on all interfaces, fixed port 8123,
	# never try to open a local browser on a headless VM.
	echo "c = get_config()" >  /root/.ipython/profile_default/ipython_notebook_config.py
	echo "c.NotebookApp.ip = '*'" >>  /root/.ipython/profile_default/ipython_notebook_config.py
	echo "c.NotebookApp.open_browser = False"  >>  /root/.ipython/profile_default/ipython_notebook_config.py
	echo "c.NotebookApp.port = 8123" >>  /root/.ipython/profile_default/ipython_notebook_config.py

	# Startup script so every notebook kernel bootstraps the cluster's
	# Spark (SparkContext etc. from pyspark/shell.py).
	cat > /root/.ipython/profile_default/startup/00-pyspark-setup.py <<'_EOF'
import os
import sys

spark_home = '/usr/lib/spark/'
os.environ["SPARK_HOME"] = spark_home
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.9-src.zip'))
# execfile() does not exist in Python 3 (which this cluster installs);
# read and exec the Spark shell bootstrap instead.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as f:
    exec(f.read())
_EOF

	##################
	# Install gcsfuse and keep notebooks in a GCS bucket for persistence.
	export GCSFUSE_REPO=gcsfuse-$(lsb_release -c -s)
	echo "deb http://packages.cloud.google.com/apt $GCSFUSE_REPO main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
	curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
	sudo apt-get update
	sudo apt-get install -y gcsfuse
	# NOTE(review): bucket name 'jupyter-notebooks' is hard-coded — confirm
	# it exists and the cluster's service account can read/write it.
	sudo gcsfuse jupyter-notebooks /IPythonNB
	##################

	# Start Jupyter Notebook on port 8123; '*' is quoted so the shell
	# does not glob-expand it against files in the cwd.
	nohup ipython notebook --no-browser --ip='*' --port=8123 > /var/log/python_notebook.log &
fi

Comments (0)