# NOTE(review): these usage notes must be shell comments — previously they were
# bare text above the shebang and would be executed as commands. Ideally the
# '#!/bin/bash' line should be moved to line 1 of the file.
#
# Revised by
# Yuri Zhylyuk
# 2016-06-18
# 5b7ef6f
#
# Copy this shell script to the Dataproc init directory:
#   gsutil cp jupyter-spark.sh gs://dataproc-inits/
# Start the cluster:
#   gcloud dataproc clusters create jupyter-1 --zone asia-east1-b --master-machine-type n1-standard-2 --master-boot-disk-size 100 --num-workers 3 --worker-machine-type n1-standard-4 --worker-boot-disk-size 50 --project spark-recommendation-engine --initialization-actions gs://dataproc-inits/jupyter-spark.sh --scopes 'https://www.googleapis.com/auth/cloud-platform' --properties spark:spark.executorEnv.PYTHONHASHSEED=0
# Change the number of workers:
#   gcloud dataproc clusters update jupyter-1 --num-workers 3
# Open an SSH SOCKS tunnel to the master:
#   gcloud compute ssh --zone=asia-east1-b --ssh-flag="-D 1080" --ssh-flag="-N" --ssh-flag="-n" jupyter-1-m
# Start a Jupyter session through the proxy:
#   chromium-browser --proxy-server="socks5://localhost:1080" --host-resolver-rules="MAP * 0.0.0.0, EXCLUDE localhost" --user-data-dir=/tmp/
#!/bin/bash
# Install Miniconda3 plus core scientific Python packages on every cluster
# node, and put /miniconda3/bin on the system-wide PATH.
#
# Fixes vs. previous revision:
#  * shell assignments must not have spaces around '=' ('export PATH = ...'
#    exported the literal word 'PATH' and broke the chain)
#  * 'sudo echo ... >> /etc/environment' performs the redirection as the
#    *calling* user, not root — use 'sudo tee -a' instead
#  * the installer's default prefix is $HOME/miniconda3; pin it with
#    '-p /miniconda3' so the PATH entries written below are actually valid
#  * download over HTTPS
sudo apt-get -y install bzip2 curl && \
(cd ~ && curl -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh) && \
chmod +x ~/Miniconda3-latest-Linux-x86_64.sh && \
~/Miniconda3-latest-Linux-x86_64.sh -b -p /miniconda3 && \
rm ~/Miniconda3-latest-Linux-x86_64.sh && \
export PATH=/miniconda3/bin:$PATH && \
echo 'PATH=/miniconda3/bin:$PATH' | sudo tee -a /etc/environment >/dev/null && \
export PYTHON_PATH=/miniconda3/bin && \
echo 'PYTHON_PATH=/miniconda3/bin' | sudo tee -a /etc/environment >/dev/null && \
conda update -y conda && \
pip install --upgrade pip && \
conda install -y numpy pandas SQLAlchemy requests lxml dask && \
sudo apt-get clean autoclean && \
sudo apt-get autoremove -y
# Point Spark's Python workers and driver at the conda interpreter, and make
# string hashing deterministic across executors (PYTHONHASHSEED=0 is required
# for Python 3 so that hash-partitioning agrees between JVM-launched workers).
# Fixed: no spaces around '=' in assignments; the persisted PYSPARK_PYTHON now
# names the interpreter binary, not the bin/ directory, matching the export
# above; 'sudo tee -a' replaces 'sudo echo >>' (the redirection of which ran
# without root privileges).
export PYSPARK_PYTHON=/miniconda3/bin/python
export PYSPARK_DRIVER_PYTHON=/miniconda3/bin/python
export PYTHONHASHSEED=0
echo "export PYSPARK_PYTHON=/miniconda3/bin/python" | sudo tee -a /etc/profile.d/spark_config.sh /etc/*bashrc
echo "export PYTHONHASHSEED=0" | sudo tee -a /etc/profile.d/spark_config.sh /etc/*bashrc /usr/lib/spark/conf/spark-env.sh
echo "spark.executorEnv.PYTHONHASHSEED=0" | sudo tee -a /etc/spark/conf/spark-defaults.conf
# Only run the notebook server on the master node.
# Fixed: 'ROLE = $(...)' is not an assignment (it tries to run a command named
# ROLE), and '[[ " ${ ROLE } " == ... ]]' is a bad substitution whose padded
# spaces could never equal 'Master'.
ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role)
if [[ "${ROLE}" == 'Master' ]]; then
  # Install IPython Notebook and create a profile. Use the absolute path
  # /IPythonNB throughout so it matches the gcsfuse mount point below
  # (previously mkdir/cd used a relative path).
  mkdir -p /IPythonNB
  cd /IPythonNB
  pip install "ipython[notebook]"
  ipython profile create default

  # Notebook config: listen on all interfaces, headless, port 8123
  # (matches the SOCKS-proxy instructions at the top of this file).
  {
    echo "c = get_config()"
    echo "c.NotebookApp.ip = '*'"
    echo "c.NotebookApp.open_browser = False"
    echo "c.NotebookApp.port = 8123"
  } > /root/.ipython/profile_default/ipython_notebook_config.py

  # Startup script so every kernel picks up the cluster's Spark.
  # Fixed: execfile() is Python-2-only; the kernel here is Miniconda *3*,
  # so use exec(open(...).read()) instead.
  cat > /root/.ipython/profile_default/startup/00-pyspark-setup.py <<'_EOF'
import os
import sys
spark_home = '/usr/lib/spark/'
os.environ["SPARK_HOME"] = spark_home
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.9-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())
_EOF

  ##################
  # Install gcsfuse and keep notebooks in a GCS bucket for persistency.
  # Fixed: no spaces around '=' or inside the command substitution;
  # $(...) replaces backticks.
  export GCSFUSE_REPO=gcsfuse-$(lsb_release -c -s)
  echo "deb http://packages.cloud.google.com/apt $GCSFUSE_REPO main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
  curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
  sudo apt-get update
  sudo apt-get install -y gcsfuse
  sudo gcsfuse jupyter-notebooks /IPythonNB
  ##################

  # Start Jupyter Notebook on port 8123 in the background.
  # Fixed: '--ip= *' / '--port= 8123' were split into separate (broken)
  # arguments, and the unquoted '*' would glob against the cwd.
  nohup ipython notebook --no-browser --ip='*' --port=8123 > /var/log/python_notebook.log 2>&1 &
fi