
Google Dataproc cluster init script with Apache Spark, Python 3 (miniconda) and some pre-installed libraries for data processing

This is an init script for starting a Google Dataproc cluster with Apache Spark and Python 3 (miniconda), plus some pre-installed libraries for data processing. It installs miniconda on every node and, on the master only, serves an IPython (Jupyter) notebook on port 8123, with notebooks persisted to a GCS bucket via gcsfuse.

copy the shell script below to the Dataproc init bucket: gsutil cp jupyter-spark.sh gs://dataproc-inits/
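
the gs://dataproc-inits bucket has to exist first; if it does not, create it once per project (a one-off setup step not in the original notes; the region here simply matches the cluster zone below): gsutil mb -l asia-east1 gs://dataproc-inits/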

start the cluster (spark:spark.executorEnv.PYTHONHASHSEED=0 is passed because Python 3 randomizes string hashes per process, which breaks PySpark operations that hash keys across executors): gcloud dataproc clusters create jupyter-1 --zone asia-east1-b --master-machine-type n1-standard-2 --master-boot-disk-size 100 --num-workers 3 --worker-machine-type n1-standard-4 --worker-boot-disk-size 50 --project spark-recommendation-engine --initialization-actions gs://dataproc-inits/jupyter-spark.sh --scopes 'https://www.googleapis.com/auth/cloud-platform' --properties spark:spark.executorEnv.PYTHONHASHSEED=0

change the number of workers: gcloud dataproc clusters update jupyter-1 --num-workers 3
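
delete the cluster when finished (a standard gcloud command, added here for completeness): gcloud dataproc clusters delete jupyter-1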

initiate the SSH channel (a SOCKS proxy on local port 1080; -N and -n keep the tunnel open without running a remote command, so leave it running in its terminal): gcloud compute ssh --zone=asia-east1-b --ssh-flag="-D 1080" --ssh-flag="-N" --ssh-flag="-n" jupyter-1-m

start a jupyter session by launching a browser through the SOCKS proxy: chromium-browser --proxy-server="socks5://localhost:1080" --host-resolver-rules="MAP * 0.0.0.0, EXCLUDE localhost" --user-data-dir=/tmp/
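
with the tunnel and proxy in place, open http://jupyter-1-m:8123 in that browser to reach the notebook the script below starts on port 8123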

#!/bin/bash

# install Miniconda (Python 3) into /miniconda3 on every node and put it on the PATH
# (-p /miniconda3 makes the install land where the paths below expect it;
#  a bare -b would default to $HOME/miniconda3)
sudo apt-get -y install bzip2 curl && \
(cd ~ && curl -O http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh) && \
chmod +x ~/Miniconda3-latest-Linux-x86_64.sh && \
~/Miniconda3-latest-Linux-x86_64.sh -b -p /miniconda3 && \
rm ~/Miniconda3-latest-Linux-x86_64.sh && \
export PATH=/miniconda3/bin:$PATH && \
echo 'PATH=/miniconda3/bin:$PATH' | sudo tee -a /etc/environment && \
export PYTHON_PATH=/miniconda3/bin && \
echo 'PYTHON_PATH=/miniconda3/bin' | sudo tee -a /etc/environment && \
conda update -y conda && \
pip install --upgrade pip && \
conda install -y numpy pandas SQLAlchemy requests lxml dask && \
sudo apt-get clean autoclean && \
sudo apt-get autoremove -y

export PYSPARK_PYTHON=/miniconda3/bin/python
export PYSPARK_DRIVER_PYTHON=/miniconda3/bin/python
export PYTHONHASHSEED=0

echo "export PYSPARK_PYTHON=/miniconda3/bin" | sudo tee -a  /etc/profile.d/spark_config.sh  /etc/*bashrc
echo "export PYTHONHASHSEED=0" | sudo tee -a  /etc/profile.d/spark_config.sh  /etc/*bashrc /usr/lib/spark/conf/spark-env.sh
sudo echo "spark.executorEnv.PYTHONHASHSEED=0" >> /etc/spark/conf/spark-defaults.conf


# Only run on the master node
ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role)
if [[ "${ROLE}" == 'Master' ]]; then
	
	# Install iPython Notebook and create a profile
	# use an absolute path so it matches the gcsfuse mount point below
	mkdir -p /IPythonNB
	cd /IPythonNB
	pip install "ipython[notebook]"
	ipython profile create default

	# Set up configuration for iPython Notebook
	cat > /root/.ipython/profile_default/ipython_notebook_config.py <<'_EOF'
c = get_config()
c.NotebookApp.ip = '*'
c.NotebookApp.open_browser = False
c.NotebookApp.port = 8123
_EOF
	
	# Setup script for iPython Notebook so it uses the cluster's Spark
	cat > /root/.ipython/profile_default/startup/00-pyspark-setup.py <<'_EOF'
import os
import sys

spark_home = '/usr/lib/spark/'
os.environ["SPARK_HOME"] = spark_home
sys.path.insert(0, os.path.join(spark_home, 'python'))
# the py4j zip version must match the Spark release shipped with the image
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.9-src.zip'))
# execfile() is Python 2 only; this profile runs under miniconda's Python 3
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())
_EOF

	##################
	# install gcsfuse and keep notebooks in a GCS bucket for persistence
	export GCSFUSE_REPO=gcsfuse-`lsb_release -c -s`
	echo "deb http://packages.cloud.google.com/apt $GCSFUSE_REPO main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
	curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
	sudo apt-get update
	sudo apt-get install -y gcsfuse
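	# assumes the jupyter-notebooks bucket already exists in this project;
	# create it once beforehand with: gsutil mb gs://jupyter-notebooks/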
	sudo gcsfuse jupyter-notebooks /IPythonNB
	##################
	
	# Start Jupyter Notebook on port 8123
	nohup ipython notebook --no-browser --ip='*' --port=8123 > /var/log/python_notebook.log 2>&1 &
fi