This is an initialization script for starting a Google Dataproc cluster with Apache Spark and Python 3 (Miniconda), plus some pre-installed libraries for data processing.

Copy this shell script to the Dataproc init bucket (replace <this-script>.sh with the script's file name): gsutil cp <this-script>.sh gs://dataproc-inits/

start cluster: gcloud dataproc clusters create jupyter-1 --zone asia-east1-b --master-machine-type n1-standard-2 --master-boot-disk-size 100 --num-workers 3 --worker-machine-type n1-standard-4 --worker-boot-disk-size 50 --project spark-recommendation-engine --initialization-actions gs://dataproc-inits/ --scopes '' --properties spark:spark.executorEnv.PYTHONHASHSEED=0

change number of workers: gcloud dataproc clusters update jupyter-1 --num-workers 3

Open an SSH tunnel to the master node (SOCKS proxy on local port 1080): gcloud compute ssh --zone=asia-east1-b --ssh-flag="-D 1080" --ssh-flag="-N" --ssh-flag="-n" jupyter-1-m

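Start a browser routed through the SOCKS proxy, then browse to the notebook server on the master node (port 8123, e.g. http://jupyter-1-m:8123): chromium-browser --proxy-server="socks5://localhost:1080" --host-resolver-rules="MAP *, EXCLUDE localhost" --user-data-dir=/tmp/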


# Install Miniconda (Python 3) under /miniconda3 together with the
# data-processing libraries, and put it on PATH for this script and for
# future sessions.
#
# The steps are chained with `&&`: the first failing step skips the rest of
# this chain, but the script itself keeps running afterwards.
#
# NOTE(review): the installer URL and installer file name were missing from
# the original commands (`curl -O` had no URL and `~/` had no file name).
# The default below is the official "latest" Miniconda3 Linux x86_64
# installer -- confirm, or pin a specific version. It can be overridden by
# setting MINICONDA_URL in the environment.
MINICONDA_URL="${MINICONDA_URL:-https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh}"
# Every later reference in this script uses /miniconda3/bin, so the prefix is
# passed to the installer explicitly instead of relying on its $HOME default.
MINICONDA_PREFIX=/miniconda3
MINICONDA_INSTALLER=~/"${MINICONDA_URL##*/}"

sudo apt-get -y install bzip2 curl &&
  # -f: fail on HTTP errors instead of saving an error page as the installer.
  curl -fsSL -o "${MINICONDA_INSTALLER}" "${MINICONDA_URL}" &&
  chmod +x "${MINICONDA_INSTALLER}" &&
  # -b: batch (non-interactive) mode, -p: installation prefix.
  "${MINICONDA_INSTALLER}" -b -p "${MINICONDA_PREFIX}" &&
  rm -- "${MINICONDA_INSTALLER}" &&
  export PATH="${MINICONDA_PREFIX}/bin:${PATH}" &&
  # /etc/environment holds plain KEY=VALUE pairs and is not a shell script, so
  # a literal "$PATH" would never be expanded there: write the expanded value.
  # `sudo echo ... >> file` is avoided because the redirection is performed by
  # the calling shell, not by sudo; `sudo tee -a` does the privileged append.
  printf 'PATH=%s\n' "${PATH}" | sudo tee -a /etc/environment > /dev/null &&
  export PYTHON_PATH="${MINICONDA_PREFIX}/bin" &&
  printf 'PYTHON_PATH=%s\n' "${PYTHON_PATH}" | sudo tee -a /etc/environment > /dev/null &&
  conda update -y conda &&
  pip install --upgrade pip &&
  conda install -y numpy pandas SQLAlchemy requests lxml dask &&
  sudo apt-get clean autoclean &&
  sudo apt-get autoremove -y

# Point PySpark (workers and driver) at the Miniconda interpreter for the
# remainder of this script.
export PYSPARK_PYTHON=/miniconda3/bin/python
export PYSPARK_DRIVER_PYTHON=/miniconda3/bin/python

# Files that persist the settings for future sessions and for Spark.
# NOTE(review): the file names were missing from the original commands (the
# tee targets were the bare directories /etc/profile.d/ and
# /usr/lib/spark/conf/, which tee cannot append to). The names below are
# assumptions -- any *.sh name under /etc/profile.d is picked up by login
# shells, and spark-env.sh is presumably the intended Spark file; confirm.
PYSPARK_PROFILE_SCRIPT=/etc/profile.d/pyspark_env.sh
SPARK_ENV_SCRIPT=/usr/lib/spark/conf/spark-env.sh

# Persist the same interpreter path that is exported above (the original
# persisted the directory /miniconda3/bin, which is not an executable).
# /etc/*bashrc is intentionally unquoted so the glob expands to the
# system-wide bashrc file(s).
echo "export PYSPARK_PYTHON=${PYSPARK_PYTHON}" | sudo tee -a "${PYSPARK_PROFILE_SCRIPT}" /etc/*bashrc
# A fixed hash seed keeps Python string hashing identical across executors.
echo "export PYTHONHASHSEED=0" | sudo tee -a "${PYSPARK_PROFILE_SCRIPT}" /etc/*bashrc "${SPARK_ENV_SCRIPT}"
# `sudo tee -a` performs the privileged append; with `sudo echo ... >> file`
# the redirection would be done by the unprivileged calling shell.
echo "spark.executorEnv.PYTHONHASHSEED=0" | sudo tee -a /etc/spark/conf/spark-defaults.conf > /dev/null

# Only run on the master node: install the IPython/Jupyter notebook server,
# wire it to the cluster's Spark, keep notebooks in a GCS bucket mounted with
# gcsfuse, and start the server on port 8123.
ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role)
if [[ "${ROLE}" == 'Master' ]]; then
	# Absolute path: this is both the gcsfuse mount point and the directory
	# the notebook server is started from.
	NOTEBOOK_DIR=/IPythonNB
	NOTEBOOK_BUCKET=jupyter-notebooks
	IPYTHON_PROFILE_DIR=/root/.ipython/profile_default

	# Install iPython Notebook and create a profile
	pip install "ipython[notebook]"
	ipython profile create default

	# Set up configuration for iPython Notebook
	# NOTE(review): the config file name was missing in the original (the
	# redirect target was the bare profile directory);
	# ipython_notebook_config.py is presumably what was intended -- confirm.
	cat > "${IPYTHON_PROFILE_DIR}/ipython_notebook_config.py" <<'_EOF'
c = get_config()
c.NotebookApp.ip = '*'
c.NotebookApp.open_browser = False
c.NotebookApp.port = 8123
_EOF

	# Setup script for iPython Notebook so it uses the cluster's Spark
	# NOTE(review): the startup file name, the py4j archive name and the
	# PySpark shell script name were missing in the original; the values
	# below are assumptions -- confirm against the installed Spark.
	cat > "${IPYTHON_PROFILE_DIR}/startup/00-pyspark-setup.py" <<'_EOF'
import glob
import os
import sys

spark_home = '/usr/lib/spark/'
os.environ["SPARK_HOME"] = spark_home
sys.path.insert(0, os.path.join(spark_home, 'python'))
# The py4j archive name carries a version number, so locate it by pattern.
for py4j_zip in glob.glob(os.path.join(spark_home, 'python/lib/py4j-*-src.zip')):
    sys.path.insert(0, py4j_zip)
# execfile() does not exist in Python 3; compile and exec the file instead.
shell_py = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_py) as shell_file:
    exec(compile(shell_file.read(), shell_py, 'exec'))
_EOF

	# install gcsfuse and keep notebooks in a bucket for persistency
	# NOTE(review): the apt repository URL and the signing-key URL were
	# missing in the original; the values below are the ones published in
	# the gcsfuse installation instructions -- confirm they are current.
	GCSFUSE_REPO="gcsfuse-$(lsb_release -c -s)"
	export GCSFUSE_REPO
	echo "deb http://packages.cloud.google.com/apt ${GCSFUSE_REPO} main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
	curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
	sudo apt-get update
	sudo apt-get install -y gcsfuse
	sudo mkdir -p "${NOTEBOOK_DIR}"
	sudo gcsfuse "${NOTEBOOK_BUCKET}" "${NOTEBOOK_DIR}"

	# Enter the directory only AFTER the bucket is mounted: a shell that
	# cd'ed in before the mount keeps pointing at the hidden local directory,
	# so notebooks saved from it would not reach the bucket.
	cd "${NOTEBOOK_DIR}" || { echo "cannot cd to ${NOTEBOOK_DIR}" >&2; exit 1; }

	# Start Jupyter Notebook on port 8123
	# '--ip=*' is quoted so the shell cannot glob-expand the asterisk;
	# stderr is sent to the same log file as stdout.
	nohup ipython notebook --no-browser '--ip=*' --port=8123 > /var/log/python_notebook.log 2>&1 &
fi

