From 084d7fdc2c46afa71f3964924179c16236b1beb1 Mon Sep 17 00:00:00 2001 From: Antoine Rey Date: Mon, 31 Aug 2015 21:50:19 +0200 Subject: [PATCH] #2 adding a docker compose file to make easier setup of the musicbrainz database and the elastic search cluster --- README.md | 49 ++++++++++++----- docker/docker-compose.yml | 11 ++++ docker/postgres-dockerfile/Dockerfile | 13 +++++ docker/postgres-dockerfile/create-database.sh | 52 +++++++++++++++++++ docker/postgres-dockerfile/postgres.env | 2 + .../batch/es-musicbrainz-batch.properties | 6 +-- 6 files changed, 118 insertions(+), 15 deletions(-) create mode 100644 docker/docker-compose.yml create mode 100644 docker/postgres-dockerfile/Dockerfile create mode 100755 docker/postgres-dockerfile/create-database.sh create mode 100644 docker/postgres-dockerfile/postgres.env diff --git a/README.md b/README.md index dbe1bb6..f773e56 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,32 @@ This project depends on several other open source projects: ## Prerequisites ## -### 1. MusicBrainz ### +A MusicBrainz database and an Elasticsearch cluster are the two prerequisites for running the batch. +You can either set up a MusicBrainz database and an Elasticsearch cluster yourself or use **Docker**. + +### Automatic installation with Docker + +Use [Docker Compose](https://docs.docker.com/compose/) to set up both a PostgreSQL database and an Elasticsearch cluster and import the MusicBrainz database. + +If you are on MacOS or Windows, you have to install [Boot2docker](http://boot2docker.io/) in order to use Docker and Docker Compose. [You will have to increase the DiskSize up to 100 GB](https://docs.docker.com/articles/b2d_volume_resize/).
+ +Command lines to start PostgreSQL and Elasticsearch: + +* `git clone https://github.com/arey/musicbrainz-elasticsearch.git` +* `cd musicbrainz-elasticsearch/docker` +* `docker-compose up -d` +* `docker-compose run postgresql /create-database.sh` +* If you are using Boot2docker: +  * `boot2docker ip` +  * edit the `es-musicbrainz-batch.properties` file and replace *localhost* with the IP in the *es.host* and *db.musicbrainz.url* properties. + +The `docker-compose run` command creates the database, downloads the latest dumps, then populates the database. +Depending on your bandwidth, downloading *mbdump.tar.bz2* could take more than an hour. + + +### Manual installation + +#### 1. MusicBrainz ### To index MusicBrainz data, **the batch requires a connection to the MusicBrainz PostgreSQL relational database**.
[Musicbrainz.org](http://Musicbrainz.org "Musicbrainz.org") does not provide a public access to its database. Thus you have to install your own database. @@ -35,16 +60,18 @@ There are a two different methods to get a local database up and running, you ca * Download a pre-configured [virtual image of the MusicBrainz Server](http://musicbrainz.org/doc/MusicBrainz_Server/Setup), or * Download the last [data dumps](http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/) and follow the relevant section of the [INSTALL.md](https://github.com/metabrainz/musicbrainz-server/blob/master/INSTALL.md) -For my part, I have chosen to download the MusicBrainz Server virtual machine. Available in Open Virtualization Archive (OVA), I have deployed it into [Oracle VirtualBox](https://www.virtualbox.org/) but you may prefer VMWare.
+For my part, before using Docker, I chose to download the MusicBrainz Server virtual machine. It is available as an Open Virtualization Archive (OVA); I deployed it into [Oracle VirtualBox](https://www.virtualbox.org/), but you may prefer VMware.
Once finished the [MusicBrainz Server setup guide](http://musicbrainz.org/doc/MusicBrainz_Server/Setup), you have to follow the below two final steps in order the PostgreSQL database be accessible to your host: -1. **Configuring port forwarding with NAT**
+**Configuring port forwarding with NAT** + Port forwarding enables VirtualBox to listen to certain ports on the host and resends all packets which arrive there to the guest, on the same or a different port. You may used same port on host and guest. Configure two rules (the second is optional): - PostgreSQL database - TCP - host : 5432 / guest : 5432 - MusicBrainz web server : TCP - host : 5000 / guest : 5000 -2. **Configuring PostgreSQL**
+**Configuring PostgreSQL** + To enable remote access to the PostgreSQL database server, you may follow [those instructions](http://www.cyberciti.biz/tips/postgres-allow-remote-access-tcp-connection.html "How Do I Enable remote access to PostgreSQL database server?"). Log into the VM (credentials: vm / musicbrainz) and edit the two configuration files pg_hba.conf and postgresql.conf. Once steps done, you may connect to the database with any JDBC clients (ie. [SQuireL](http://squirrel-sql.sourceforge.net/ "SQuirreL SQL Client")): @@ -52,21 +79,19 @@ Once steps done, you may connect to the database with any JDBC clients (ie. [SQu * URL: jdbc:postgresql://localhost:5432/musicbrainz * Credentials: musicbrainz / musicbrainz -### 2. Elasticsearch ### +#### 2. Elasticsearch ### -Before launching the batch, you have to [download Elasticsearch v0.90.5](http://www.elasticsearch.org/download/) and configure it. -One unziped, edit the config/elaticsearch.yml configuration file. Uncomment the _cluster.name_ line and set it with the _musicbrainz_ cluster name: -`cluster.name: musicbrainz` -You may also prefer to keep the default _elasticsearch_ cluster name and change the name in the es-musicbrainz-batch.properties configuration file. +Before launching the batch, you have to [download Elasticsearch v1.7.1](https://www.elastic.co/downloads/elasticsearch) and unarchive it. +If you change the default _elasticsearch_ cluster name in the config/elasticsearch.yml configuration file, set the same name in the es-musicbrainz-batch.properties configuration file. ## Quick Start ## * `git clone https://github.com/arey/musicbrainz-elasticsearch.git` -* start Elasticsearch -* start MusicBrainz database or VM * `mvn install` * `mvn exec:java` (execute the *IndexBatchMain* main class) +On a Macbook Pro, the batch takes less than 3 minutes to build the Elasticsearch index.
+ ## Demo MusicBrainz database searching with Elasticsearch : @@ -111,7 +136,7 @@ French articles on the [javaetmoi.com](http://javaetmoi.com) blog: VersionRelease dateFeatures date - 1.1-SNAPSHOTNext versionElasticsearch 1.0.0 update + 1.1-SNAPSHOT23/08/2015Elasticsearch 1.7.1 update and Docker compose files 1.026/10/2013Initial version developed for a workshop about Elasticsearch (0.90.5) diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 0000000..f546428 --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,11 @@ +postgresql: + build: postgres-dockerfile + ports: + - "5432:5432" + env_file: + - ./postgres-dockerfile/postgres.env +elasticsearch: + image: elasticsearch:1.7.1 + ports: + - "9200:9200" + - "9300:9300" \ No newline at end of file diff --git a/docker/postgres-dockerfile/Dockerfile b/docker/postgres-dockerfile/Dockerfile new file mode 100644 index 0000000..d059167 --- /dev/null +++ b/docker/postgres-dockerfile/Dockerfile @@ -0,0 +1,13 @@ +FROM postgres:9.4 + +RUN apt-get update + +RUN DEBIAN_FRONTEND=noninteractive apt-get -y -q install git-core build-essential libxml2-dev libpq-dev libexpat1-dev libdb-dev libicu-dev postgresql-server-dev-9.4 wget + +RUN git clone https://github.com/metabrainz/postgresql-musicbrainz-unaccent.git && git clone https://github.com/metabrainz/postgresql-musicbrainz-collate.git + +RUN cd postgresql-musicbrainz-unaccent && make && make install && cd ../postgresql-musicbrainz-collate && make && make install && cd ../ + +RUN echo "listen_addresses='*'" >> /var/lib/postgresql/data/postgresql.conf + +ADD create-database.sh /create-database.sh diff --git a/docker/postgres-dockerfile/create-database.sh b/docker/postgres-dockerfile/create-database.sh new file mode 100755 index 0000000..07f9369 --- /dev/null +++ b/docker/postgres-dockerfile/create-database.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +cd /tmp + +echo "Creating Musicbrainz database structure" + +echo 
"postgresql:5432:musicbrainz:$POSTGRES_USER:$POSTGRES_PASSWORD" > ~/.pgpass +chmod 0600 ~/.pgpass + +psql -h postgresql -d musicbrainz -U $POSTGRES_USER -a -c "CREATE SCHEMA musicbrainz" + +wget https://raw.githubusercontent.com/metabrainz/musicbrainz-server/master/admin/sql/Extensions.sql +psql -h postgresql -d musicbrainz -U $POSTGRES_USER -a -f Extensions.sql +rm Extensions.sql + +wget https://raw.githubusercontent.com/metabrainz/musicbrainz-server/master/admin/sql/CreateTables.sql +psql -h postgresql -d musicbrainz -U $POSTGRES_USER -a -f CreateTables.sql +rm CreateTables.sql + +echo "Downloading last Musicbrainz dump" +wget -nd -nH -P /tmp http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/LATEST +LATEST="$(cat /tmp/LATEST)" +wget -nd -nH -P /tmp http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/$LATEST/mbdump-derived.tar.bz2 +wget -nd -nH -P /tmp http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/$LATEST/mbdump.tar.bz2 + + +echo "Uncompressing Musicbrainz dump" +tar xjf /tmp/mbdump-derived.tar.bz2 +rm mbdump-derived.tar.bz2 +tar xjf /tmp/mbdump.tar.bz2 +rm mbdump.tar.bz2 + +for f in mbdump/* +do + tablename="${f:7}" + echo "Importing $tablename table" + echo "psql -h postgresql -d musicbrainz -U $POSTGRES_USER -a -c COPY $tablename FROM '/tmp/$f'" + chmod a+rX /tmp/$f + psql -h postgresql -d musicbrainz -U $POSTGRES_USER -a -c "\COPY $tablename FROM '/tmp/$f'" +done + +rm -rf mbdump + +echo "Creating Indexes and Primary Keys" + +wget https://raw.githubusercontent.com/metabrainz/musicbrainz-server/master/admin/sql/CreatePrimaryKeys.sql +psql -h postgresql -d musicbrainz -U $POSTGRES_USER -a -f CreatePrimaryKeys.sql +rm CreatePrimaryKeys.sql + +wget https://raw.githubusercontent.com/metabrainz/musicbrainz-server/master/admin/sql/CreateIndexes.sql +psql -h postgresql -d musicbrainz -U $POSTGRES_USER -a -f CreateIndexes.sql +rm CreateIndexes.sql diff --git a/docker/postgres-dockerfile/postgres.env b/docker/postgres-dockerfile/postgres.env 
new file mode 100644 index 0000000..09ecbc5 --- /dev/null +++ b/docker/postgres-dockerfile/postgres.env @@ -0,0 +1,2 @@ +POSTGRES_USER=musicbrainz +POSTGRES_PASSWORD=musicbrainz \ No newline at end of file diff --git a/src/main/resources/com/javaetmoi/elasticsearch/musicbrainz/batch/es-musicbrainz-batch.properties b/src/main/resources/com/javaetmoi/elasticsearch/musicbrainz/batch/es-musicbrainz-batch.properties index 776c560..83229d9 100644 --- a/src/main/resources/com/javaetmoi/elasticsearch/musicbrainz/batch/es-musicbrainz-batch.properties +++ b/src/main/resources/com/javaetmoi/elasticsearch/musicbrainz/batch/es-musicbrainz-batch.properties @@ -1,6 +1,6 @@ # ElaticSearch configuration -es.host=localhost:9300 -es.cluster.name=musicbrainz +es.host=192.168.59.103:9300 +es.cluster.name=elasticsearch es.index=musicalbum es.settings.filename=com/javaetmoi/elasticsearch/musicbrainz/batch/es-index-settings.json es.ping_timeout=120000 @@ -10,7 +10,7 @@ es.index.timeout=300000 # MusicBrainz PostgreSQL datasource db.musicbrainz.driver.class=org.postgresql.Driver -db.musicbrainz.url=jdbc:postgresql://localhost:5432/musicbrainz +db.musicbrainz.url=jdbc:postgresql://192.168.59.103:5432/musicbrainz db.musicbrainz.user=musicbrainz db.musicbrainz.password=musicbrainz db.musicbrainz.max.pool.size=20