Commit 319d9d8b authored by yuhai's avatar yuhai
Browse files

Initial commit

parents
version: "3"
services:
fastai: &fastai
restart: unless-stopped
working_dir: /data
image: fastai/codespaces
logging:
driver: json-file
options:
max-size: 50m
stdin_open: true
tty: true
volumes:
- .:/data/
notebook:
<<: *fastai
command: bash -c "pip install -e . && jupyter notebook --allow-root --no-browser --ip=0.0.0.0 --port=8080 --NotebookApp.token='' --NotebookApp.password=''"
ports:
- "8080:8080"
watcher:
<<: *fastai
command: watchmedo shell-command --command nbdev_build_docs --pattern *.ipynb --recursive --drop
network_mode: host # for GitHub Codespaces https://github.com/features/codespaces/
jekyll:
<<: *fastai
ports:
- "4000:4000"
command: >
bash -c "pip install .
&& nbdev_build_docs && cd docs
&& bundle i
&& chmod -R u+rwx . && bundle exec jekyll serve --host 0.0.0.0"
source "https://rubygems.org"
gem "jekyll", ">= 3.7"
gem "jekyll-remote-theme"
#################################################
### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
#################################################
# Instead edit ../../sidebar.json
entries:
- folders:
- folderitems:
- output: web,pdf
title: Overview
url: /
- output: web,pdf
title: Iterative_masking
url: core.html
output: web
title: Iterative_masking
output: web
title: Sidebar
---
title: Iterative_masking
keywords: fastai
sidebar: home_sidebar
summary: "Use MSA Transformer to generate synthetic protein sequences by masking iteratively the same MSA."
description: "Use MSA Transformer to generate synthetic protein sequences by masking iteratively the same MSA."
nb_path: "00_core.ipynb"
---
<!--
#################################################
### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
#################################################
# file to edit: 00_core.ipynb
# command to build the docs after a change: nbdev_build_docs
-->
<div class="container" id="notebook-container">
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
</div>
{% endraw %}
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
</div>
{% endraw %}
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
<div class="output_wrapper">
<div class="output">
<div class="output_area">
<div class="output_markdown rendered_html output_subarea ">
<h2 id="IM_MSA_Transformer" class="doc_header"><code>class</code> <code>IM_MSA_Transformer</code><a href="" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>IM_MSA_Transformer</code>(<strong><code>iterations</code></strong>=<em><code>None</code></em>, <strong><code>p_mask</code></strong>=<em><code>None</code></em>, <strong><code>filename</code></strong>=<em><code>None</code></em>, <strong><code>num</code></strong>=<em><code>None</code></em>, <strong><code>filepath</code></strong>=<em><code>None</code></em>)</p>
</blockquote>
<p>Class that implements the Iterative masking algorithm</p>
</div>
</div>
</div>
</div>
</div>
{% endraw %}
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
<div class="output_wrapper">
<div class="output">
<div class="output_area">
<div class="output_markdown rendered_html output_subarea ">
<h4 id="IM_MSA_Transformer.Batch_MSA" class="doc_header"><code>IM_MSA_Transformer.Batch_MSA</code><a href="__main__.py#L303" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>IM_MSA_Transformer.Batch_MSA</code>(<strong><code>use_pdf</code></strong>=<em><code>False</code></em>, <strong><code>simplified</code></strong>=<em><code>False</code></em>, <strong><code>repetitions</code></strong>=<em><code>2</code></em>, <strong><code>sample_all</code></strong>=<em><code>False</code></em>, <strong><code>T</code></strong>=<em><code>1</code></em>, <strong><code>phylo</code></strong>=<em><code>False</code></em>)</p>
</blockquote>
<p>Generate a full MSA by calling with different input MSAs the iterative MSA generator defined
in: <code>self.NEW_MSA</code>.</p>
<p>---&gt; Use this function with <code>simplified</code>=False only if you need tokens in cuda ! (i.e. if you want to compute embeddings
or contacts), otherwise use <code>simplified</code>=True</p>
<p>The variable <code>self.iterations</code> must be a numpy array which specifies when (at which iterations)
the tokens must be saved. The last element of the array gives the maximum number of iterations that should be done.</p>
<p><code>repetitions</code>: the number of times self.NEW_MSA() is repeated with a different input MSA.</p>
<p><code>use_pdf</code>: if it's True the function samples the tokens from the logits pdf
instead of taking the argmax (greedy sampling).</p>
<p><code>sample_all</code>: if True all the new tokens are obtained from the logits (both
the masked and the non masked), if False the non masked tokens
are left untouched and only the masked ones are changed.</p>
<p><code>T</code>: Temperature of sampling from the pdf of output logits.</p>
<p><code>phylo</code>: if True the start sequences are sampled from phylogeny weights instead of randomly.</p>
</div>
</div>
</div>
</div>
</div>
{% endraw %}
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
<div class="output_wrapper">
<div class="output">
<div class="output_area">
<div class="output_markdown rendered_html output_subarea ">
<h4 id="IM_MSA_Transformer.Context_MSA" class="doc_header"><code>IM_MSA_Transformer.Context_MSA</code><a href="__main__.py#L448" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>IM_MSA_Transformer.Context_MSA</code>(<strong><code>depth</code></strong>=<em><code>None</code></em>, <strong><code>ancestor</code></strong>=<em><code>None</code></em>, <strong><code>context</code></strong>=<em><code>None</code></em>, <strong><code>use_pdf</code></strong>=<em><code>False</code></em>, <strong><code>simplified</code></strong>=<em><code>False</code></em>, <strong><code>sample_all</code></strong>=<em><code>False</code></em>, <strong><code>print_all</code></strong>=<em><code>True</code></em>, <strong><code>T</code></strong>=<em><code>1</code></em>)</p>
</blockquote>
<p>Generates a new MSA with context-generation by iterating the masking on the original ancestor sequence
using: <code>self.generate_MSA_context</code>. It masks <code>ancestor</code> (original sequence) and uses the sequences in <code>context</code> as context MSA.</p>
<p>---&gt; Use this function with <code>simplified</code>=False only if you need tokens in cuda ! (i.e. if you want to compute embeddings
or contacts), otherwise use <code>simplified</code>=True</p>
<p>The variable <code>self.iterations</code> must be a numpy array which specifies when (at which iterations)
the tokens must be saved. The last element of the array gives the maximum number of iterations that should be done.
If <code>print_all</code>=True then it saves the generated sequences at each iteration.</p>
<p><code>ancestor</code>: input sequence to be masked iteratively.</p>
<p><code>context</code>: context MSA (not masked).</p>
<p><code>use_pdf</code>: if it's True the function samples the tokens from the logits pdf
instead of taking the argmax (greedy sampling).</p>
<p><code>sample_all</code>: if True all the new tokens are obtained from the logits (both
the masked and the non masked), if False the non masked tokens
are left untouched and only the masked ones are changed.</p>
<p><code>T</code>: Temperature of sampling from the pdf of output logits.</p>
<p><code>depth</code>: number of generated sequences, if None the depth is the number of ancestor sequences.</p>
</div>
</div>
</div>
</div>
</div>
{% endraw %}
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
</div>
{% endraw %}
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
<div class="output_wrapper">
<div class="output">
<div class="output_area">
<div class="output_markdown rendered_html output_subarea ">
<h4 id="gen_MSAs" class="doc_header"><code>gen_MSAs</code><a href="__main__.py#L6" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>gen_MSAs</code>(<strong><code>filepath</code></strong>:"Path of the input directory", <strong><code>filename</code></strong>:"Name of the input file(s)", <strong><code>new_dir</code></strong>:"Name of the output directory", <strong><code>pdf</code></strong>:"Should I sample tokens from the pdf ? (bool)", <strong><code>T</code></strong>:"Which is the sampling Temperature from the pdf ? (only when <code>pdf</code> is True)", <strong><code>sample_all</code></strong>:"Should I sample all tokens or just the masked ones ? (True = sample all tokens)", <strong><code>Iters</code></strong>:"Number of total iterations to generate the new tokens", <strong><code>pmask</code></strong>:"Masking probability", <strong><code>num</code></strong>:"Size of the batches MSAs which the MSA-Transformer receives as input", <strong><code>depth</code></strong>:"Number of batches (of size num) that you want to generate", <strong><code>generate</code></strong>:"How should I generate sequences ? False (=Batch generation) or Linear with context (=linear-ran/linear-tot-ran), <code>-ran</code> means that the context MSA is sampled randomly (once) while <code>-tot-ran</code> means that it is sampled randomly each time.", <strong><code>print_all</code></strong>:"Should I print the MSA after each iteration ? (bool)", <strong><code>range_vals</code></strong>:"First and last index of the sequences that you want to use as ancestors", <strong><code>phylo_w</code></strong>:"Should I sample the starting sequences from the phylogeny weights ? (bool)")</p>
</blockquote>
<p>Generate a new MSA either with Batch generation or with Context generation. It shuffles the initial MSA and uses different slices as batch MSAs</p>
</div>
</div>
</div>
</div>
</div>
{% endraw %}
<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h2 id="Build-library">Build library<a class="anchor-link" href="#Build-library"> </a></h2>
</div>
</div>
</div>
</div>
---
search: exclude
layout: none
---
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>{{ site.title | xml_escape }}</title>
<description>{{ site.description | xml_escape }}</description>
<link>{{ site.url }}/</link>
<atom:link href="{{ "/feed.xml" | prepend: site.url }}" rel="self" type="application/rss+xml"/>
<pubDate>{{ site.time | date_to_rfc822 }}</pubDate>
<lastBuildDate>{{ site.time | date_to_rfc822 }}</lastBuildDate>
<generator>Jekyll v{{ jekyll.version }}</generator>
{% for post in site.posts limit:10 %}
<item>
<title>{{ post.title | xml_escape }}</title>
<description>{{ post.content | xml_escape }}</description>
<pubDate>{{ post.date | date_to_rfc822 }}</pubDate>
<link>{{ post.url | prepend: site.url }}</link>
<guid isPermaLink="true">{{ post.url | prepend: site.url }}</guid>
{% for tag in post.tags %}
<category>{{ tag | xml_escape }}</category>
{% endfor %}
{% for tag in page.tags %}
<category>{{ tag | xml_escape }}</category>
{% endfor %}
</item>
{% endfor %}
</channel>
</rss>
---
title: Iterative_masking
keywords: fastai
sidebar: home_sidebar
summary: "Supporting repository for: \"Generative power of a protein language model trained on multiple sequence alignments\" (preprint: https://doi.org/10.1101/2022.04.14.488405). We use MSA Transformer (https://doi.org/10.1101/2021.02.12.430858) to generate synthetic protein sequences by masking iteratively the same MSA."
description: "Supporting repository for: \"Generative power of a protein language model trained on multiple sequence alignments\" (preprint: https://doi.org/10.1101/2022.04.14.488405). We use MSA Transformer (https://doi.org/10.1101/2021.02.12.430858) to generate synthetic protein sequences by masking iteratively the same MSA."
nb_path: "index.ipynb"
---
<!--
#################################################
### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
#################################################
# file to edit: index.ipynb
# command to build the docs after a change: nbdev_build_docs
-->
<div class="container" id="notebook-container">
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
</div>
{% endraw %}
<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h2 id="Getting-started">Getting started<a class="anchor-link" href="#Getting-started"> </a></h2><p>Clone this repository on your local machine by running:</p>
<div class="highlight"><pre><span></span>git clone git@github.com:Bitbol-Lab/Iterative_masking.git
</pre></div>
<p>and move inside the root folder.
One can then use the functions directly from the cloned repository (in the folder <code>Iterative_masking</code>) or install it with an editable install by running:</p>
<div class="highlight"><pre><span></span>pip install -e .
</pre></div>
<p>We recommend creating and activating a dedicated <code>conda</code> or <code>virtualenv</code> Python virtual environment.</p>
<h2 id="Requirements">Requirements<a class="anchor-link" href="#Requirements"> </a></h2><p>In order to use the functions, the following python packages are required:</p>
<ul>
<li>numpy</li>
<li>scipy</li>
<li>numba</li>
<li>fastcore</li>
<li>biopython</li>
<li>esm==0.4.0</li>
<li>pytorch</li>
</ul>
<p>It is also required to use a GPU (with cuda).</p>
</div>
</div>
</div>
<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h2 id="How-to-use">How to use<a class="anchor-link" href="#How-to-use"> </a></h2>
</div>
</div>
</div>
<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<p><a href="/Iterative_masking/core.html#IM_MSA_Transformer"><code>IM_MSA_Transformer</code></a>: Class with different functions used to generate new MSAs with the iterative masking procedure</p>
<p><a href="/Iterative_masking/core.html#gen_MSAs"><code>gen_MSAs</code></a>: example function (with parser) that can be used to generate and save new sequences directly from the terminal.</p>
</div>
</div>
</div>
</div>
{
"Iterative_masking": {
"Overview": "/",
"Iterative_masking": "core.html"
}
}
\ No newline at end of file
---
layout: none
search: exclude
---
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{% for post in site.posts %}
{% unless post.search == "exclude" %}
<url>
<loc>{{site.url}}{{post.url}}</loc>
</url>
{% endunless %}
{% endfor %}
{% for page in site.pages %}
{% unless page.search == "exclude" %}
<url>
<loc>{{site.url}}{{ page.url}}</loc>
</url>
{% endunless %}
{% endfor %}
</urlset>
\ No newline at end of file
# Created by .ignore support plugin (hsz.mobi)
### Go template
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof
.idea
/bin/
/log/
/pkg/
*.iml
/src
language: go
os:
- osx
go:
- 1.10
install:
- go get github.com/mitchellh/gox
script: make cross-gox-build-all-platform package-all-platform
deploy:
provider: releases
skip_cleanup: true
api_key:
secure: E7POmQPK3yM4X3XI/WBSc9zFQ5hc7DuDr3ECOnXdkEbq5Uec0WcV9EZHmiMhPwd/dNKQ5/fVGdP+R9RsMtllMG29r6WWVTOdoZP/upWUn2rH7ZzpUuTCo2rXgKjFSw4iu8qzs0Rg7kMcvepKp8kvN1qcujri8sRz3K8ntHz3krEE2+aoNZQZBKHrRoKoQyBaouVQjVGIYI+Pafyyco0zByt1ijorWK04aCCx9xwoRLw6CS1RJVtk98I5EPDhOr0/madYYkVIQj5tpqv7BIUzT+cVLXfXWh9D2XrUuv1ui/1FvHXH/wuxkplWhMAGQaDRF9O86HtAHN+8EC/0VchTNQbVcBScw1gu3R4wPhU7xyfZQBa86TgV3CdfJGiSDkqIY1D7M+XF8f8XOJ6yaKA5Vjh83B3u7aeCwnLRf9appJsxY5YECqBua3sQJnxLT9wv8ObRd+7Yb8itXW6CN8iqcwnS2oIvKJPpJ99fcE4DPrPprONPiQiBse8378f8p6/YR0K+S3ai4XOSBO/qnZLHsKzVerwhIabMEjgpV7t0F+LegsmvToHkhgK0ArMyh9IHWgfwFLq1C/MOJGF5FLS09TrOmuNEeYrHheTrd2KLezUKX5ujWp0H8Rddks62baIJkYHYbBPYEQLWcmqG4kI6kAYBJC+5LGgoNLJUye/Mnyo=
file: 'bin/*.tar.gz'
file_glob: true
on:
repo: medcl/elasticsearch-migration
tags: true
SHELL=/bin/bash
CWD=$(shell pwd)
OLDGOPATH=${GOPATH}
NEWGOPATH:=${CWD}:${OLDGOPATH}
export GOPATH=$(NEWGOPATH)
PATH := $(PATH):$(GOPATH)/bin
build: clean config
go build -o bin/esm
tar: build
tar cfz bin/esm.tar.gz bin/esm
cross-build-all-platform: clean config
go test
GOOS=windows GOARCH=amd64 go build -o bin/windows64/esm.exe
GOOS=windows GOARCH=386 go build -o bin/windows32/esm.exe
GOOS=darwin GOARCH=amd64 go build -o bin/darwin64/esm
GOOS=darwin GOARCH=386 go build -o bin/darwin32/esm
GOOS=linux GOARCH=amd64 go build -o bin/linux64/esm
GOOS=linux GOARCH=386 go build -o bin/linux32/esm
GOOS=linux GOARCH=arm go build -o bin/linux_arm/esm
GOOS=freebsd GOARCH=amd64 go build -o bin/freebsd64/esm
GOOS=freebsd GOARCH=386 go build -o bin/freebsd32/esm
GOOS=netbsd GOARCH=amd64 go build -o bin/netbsd64/esm
GOOS=netbsd GOARCH=386 go build -o bin/netbsd32/esm
GOOS=openbsd GOARCH=amd64 go build -o bin/openbsd64/esm
GOOS=openbsd GOARCH=386 go build -o bin/openbsd32/esm
gox-cross-build-all-platform: clean config
go get github.com/mitchellh/gox
go test
gox -output="bin/esm_{{.OS}}_{{.Arch}}"
cross-gox-build-all-platform: clean config
go get github.com/mitchellh/gox
go test
gox -os=windows -arch=amd64 -output="bin/windows64/esm"
gox -os=windows -arch=386 -output=bin/windows32/esm
gox -os=darwin -arch=amd64 -output=bin/darwin64/esm
gox -os=darwin -arch=386 -output=bin/darwin32/esm
gox -os=linux -arch=amd64 -output=bin/linux64/esm
gox -os=linux -arch=386 -output=bin/linux32/esm
gox -os=linux -arch=arm -output=bin/linux_arm/esm
gox -os=freebsd -arch=amd64 -output=bin/freebsd64/esm
gox -os=freebsd -arch=386 -output=bin/freebsd32/esm
gox -os=netbsd -arch=amd64 -output=bin/netbsd64/esm
gox -os=netbsd -arch=386 -output=bin/netbsd32/esm
gox -os=openbsd -arch=amd64 -output=bin/openbsd64/esm
gox -os=openbsd -arch=386 -output=bin/openbsd32/esm
cross-build: clean config
go test
GOOS=windows GOARCH=amd64 go build -o bin/windows64/esm.exe
GOOS=darwin GOARCH=amd64 go build -o bin/darwin64/esm
GOOS=linux GOARCH=amd64 go build -o bin/linux64/esm
all: clean config cross-build
all-platform: clean config cross-build-all-platform
format:
gofmt -s -w -tabs=false -tabwidth=4 main.go
clean:
rm -rif bin
mkdir bin
config:
@echo "get Dependencies"
go env
go get gopkg.in/cheggaaa/pb.v1
go get github.com/jessevdk/go-flags
go get github.com/olekukonko/ts
go get github.com/cihub/seelog
go get github.com/parnurzeal/gorequest
dist: cross-build package
dist-all: all package
dist-all-platform: all-platform package-all-platform
package:
@echo "Packaging"
tar cfz bin/windows64.tar.gz bin/windows64/esm.exe
tar cfz bin/darwin64.tar.gz bin/darwin64/esm
tar cfz bin/linux64.tar.gz bin/linux64/esm
package-all-platform:
@echo "Packaging"
tar cfz bin/windows64.tar.gz bin/windows64/esm.exe
tar cfz bin/windows32.tar.gz bin/windows32/esm.exe
tar cfz bin/darwin64.tar.gz bin/darwin64/esm
tar cfz bin/darwin32.tar.gz bin/darwin32/esm
tar cfz bin/linux64.tar.gz bin/linux64/esm
tar cfz bin/linux32.tar.gz bin/linux32/esm
tar cfz bin/linux_arm.tar.gz bin/linux_arm/esm
tar cfz bin/freebsd64.tar.gz bin/freebsd64/esm
tar cfz bin/freebsd32.tar.gz bin/freebsd32/esm
tar cfz bin/netbsd64.tar.gz bin/netbsd64/esm
tar cfz bin/netbsd32.tar.gz bin/netbsd32/esm
tar cfz bin/openbsd64.tar.gz bin/openbsd64/esm
tar cfz bin/openbsd32.tar.gz bin/openbsd32/esm
cross-compile:
@echo "Prepare Cross Compiling"
cd $(GOROOT)/src && GOOS=windows GOARCH=amd64 ./make.bash --no-clean
cd $(GOROOT)/src && GOOS=darwin GOARCH=amd64 ./make.bash --no-clean 2> /dev/null 1> /dev/null
cd $(GOROOT)/src && GOOS=linux GOARCH=amd64 ./make.bash --no-clean 2> /dev/null 1> /dev/null
cd $(CWD)
# An Elasticsearch Migration Tool
Support cross version and http basic auth.
[![asciicast](https://asciinema.org/a/e562wy1ro30yboznkj5f539md.png)](https://asciinema.org/a/e562wy1ro30yboznkj5f539md)
## Features:
* Cross version migration supported
* Overwrite index name
* Copy index settings and mapping
* Support http basic auth
* Support dump into local file
* Support loading from local file
* Support http proxy
* Support sliced scroll (only for elasticsearch 5.0)
## Example:
copy index `index_name` from `192.168.1.x` to `192.168.1.y:9200`
```
./bin/esm -s http://192.168.1.x:9200 -d http://192.168.1.y:9200 -x index_name -w=5 -b=10 -c 10000
```
copy index `src_index` from `192.168.1.x` to `192.168.1.y:9200` and save with `dest_index`
```
./bin/esm -s http://localhost:9200 -d http://localhost:9200 -x src_index -y dest_index -w=5 -b=100
```
support Basic-Auth
```
./bin/esm -s http://localhost:9200 -x "src_index" -y "dest_index" -d http://localhost:9201 -n admin:111111
```
copy settings and override shard size
```
./bin/esm -s http://localhost:9200 -x "src_index" -y "dest_index" -d http://localhost:9201 -m admin:111111 -c 10000 --shards=50 --copy_settings
```
copy settings and mapping, recreate target index, add query to source fetch, refresh after migration
```
./bin/esm -s http://localhost:9200 -x "src_index" -q=query:phone -y "dest_index" -d http://localhost:9201 -c 10000 --shards=5 --copy_settings --copy_mapping --force --refresh
```
dump elasticsearch documents into local file
```
./bin/esm -s http://localhost:9200 -x "src_index" -m admin:111111 -c 5000 -b -q=query:mixer --refresh -o=dump.bin
```
loading data from dump files, bulk insert to another es instance
```
./bin/esm -d http://localhost:9200 -y "dest_index" -n admin:111111 -c 5000 -b 5 --refresh -i=dump.bin
```
support proxy
```
./bin/esm -d http://123345.ap-northeast-1.aws.found.io:9200 -y "dest_index" -n admin:111111 -c 5000 -b 1 --refresh -i dump.bin --dest_proxy=http://127.0.0.1:9743
```
use sliced scroll(only available in elasticsearch v5) to speed scroll, and update shard number
```
./bin/esm -s=http://192.168.3.206:9200 -d=http://localhost:9200 -n=elastic:changeme -f --copy_settings --copy_mappings -x=bestbuykaggle --sliced_scroll_size=5 --shards=50 --refresh
```
## Download
https://github.com/medcl/elasticsearch-dump/releases
## Compile:
If the downloaded binaries do not fit your environment, you may try to compile it yourself. `go` is required.
`make build`
## Options
```
-s, --source= source elasticsearch instance
-d, --dest= destination elasticsearch instance
-q, --query= query against source elasticsearch instance, filter data before migrate, ie: name:medcl
-m, --source_auth basic auth of source elasticsearch instance, ie: user:pass
-n, --dest_auth basic auth of target elasticsearch instance, ie: user:pass
-c, --count= number of documents at a time: ie "size" in the scroll request (10000)
--sliced_scroll_size= size of sliced scroll, to make it work, the size should be > 1, default:"1"
-t, --time= scroll time (1m)
--shards= set a number of shards on newly created indexes
--copy_settings copy index settings from source
--copy_mappings copy index mappings from source
-f, --force delete destination index before copying, default:false
-x, --src_indexes= list of indexes to copy, comma separated (_all), support wildcard match(*)
-y, --dest_index= indexes name to save, allow only one indexname, original indexname will be used if not specified
-a, --all copy indexes starting with . and _ (false)
-w, --workers= concurrency number for bulk workers, default is: "1"
-b --bulk_size bulk size in MB" default:5
-v --log setting log level,options:trace,debug,info,warn,error
-i --input_file indexing from local dump file, file format: {"_id":"xxx","_index":"xxx","_source":{"xxx":"xxx"},"_type":"xxx" }
-o --output_file output documents of source index into local file, file format same as input_file.
--source_proxy set proxy to source http connections, ie: http://127.0.0.1:8080
--dest_proxy set proxy to destination http connections, ie: http://127.0.0.1:8080
--refresh refresh after migration finished
```
Versions
--------
From | To
-----------|-----------
1.x | 1.x
1.x | 2.x
1.x | 5.0
2.x | 1.x
2.x | 2.x
2.x | 5.0
5.0 | 1.x
5.0 | 2.x
5.0 | 5.0
/*
Copyright 2016 Medcl (m AT medcl.net)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"sync"
log "github.com/cihub/seelog"
"encoding/json"
"bytes"
"gopkg.in/cheggaaa/pb.v1"
"time"
)
// NewBulkWorker drains c.DocChan, batches documents into an Elasticsearch
// bulk-request body (one "create" action line plus one source line per
// document), and flushes the batch to the target cluster whenever it
// approaches the configured size limit or input goes quiet for 5 seconds.
// It increments *docCount per buffered document, adds flushed item counts
// to pb, and calls wg.Done() on exit.
func (c *Migrator) NewBulkWorker(docCount *int, pb *pb.ProgressBar, wg *sync.WaitGroup) {
	log.Debug("start es bulk worker")
	bulkItemSize := 0
	// mainBuf accumulates the full bulk body; docBuf stages one document
	// (action + source) so a partial encode never corrupts mainBuf.
	mainBuf := bytes.Buffer{}
	docBuf := bytes.Buffer{}
	docEnc := json.NewEncoder(&docBuf)
READ_DOCS:
	for {
		select {
		case docI, open := <-c.DocChan:
			var err error
			log.Trace("read doc from channel,", docI)
			// this check is in case the document is an error with scroll stuff
			if status, ok := docI["status"]; ok {
				if status.(int) == 404 {
					log.Error("error: ", docI["response"])
					continue
				}
			}
			// sanity check
			// NOTE(review): when the channel is closed, docI is a nil map, so
			// this loop is what actually exits (via `break READ_DOCS`, jumping
			// past the loop to WORKER_DONE); the `!open` check further below is
			// only reached for documents that carry all four keys.
			for _, key := range []string{"_index", "_type", "_source", "_id"} {
				if _, ok := docI[key]; !ok {
					//json,_:=json.Marshal(docI)
					//log.Errorf("failed parsing document: %v", string(json))
					break READ_DOCS
				}
			}
			// Redirect the document to the override index if one was given.
			var tempDestIndexName string
			tempDestIndexName = docI["_index"].(string)
			if c.Config.TargetIndexName != "" {
				tempDestIndexName = c.Config.TargetIndexName
			}
			doc := Document{
				Index: tempDestIndexName,
				Type: docI["_type"].(string),
				source: docI["_source"].(map[string]interface{}),
				Id: docI["_id"].(string),
			}
			// if channel is closed flush and gtfo
			if !open {
				goto WORKER_DONE
			}
			// sanity check
			if len(doc.Index) == 0 || len(doc.Id) == 0 || len(doc.Type) == 0 {
				log.Errorf("failed decoding document: %+v", doc)
				continue
			}
			// encode the doc and the _source field for a bulk request
			post := map[string]Document{
				"create": doc,
			}
			if err = docEnc.Encode(post); err != nil {
				log.Error(err)
			}
			if err = docEnc.Encode(doc.source); err != nil {
				log.Error(err)
			}
			// if we approach the 100mb es limit, flush to es and reset mainBuf
			if mainBuf.Len() + docBuf.Len() > (c.Config.BulkSizeInMB * 1000000) {
				goto CLEAN_BUFFER
			}
			// append the doc to the main buffer
			mainBuf.Write(docBuf.Bytes())
			// reset for next document
			bulkItemSize++
			docBuf.Reset()
			(*docCount)++
		case <-time.After(time.Second * 5):
			// Idle flush: push whatever is buffered after 5s of silence.
			log.Debug("5s no message input")
			goto CLEAN_BUFFER
		case <-time.After(time.Minute * 5):
			// NOTE(review): this case is unreachable — the 5-second timer in
			// the same select always fires first, so the worker is never shut
			// down by this branch. Confirm intent before removing.
			log.Warn("5m no message input, close worker")
			goto WORKER_DONE
		}
		goto READ_DOCS
	CLEAN_BUFFER:
		// Flush the accumulated bulk body and reset the per-flush item count.
		c.TargetESAPI.Bulk(&mainBuf)
		log.Trace("clean buffer, and execute bulk insert")
		pb.Add(bulkItemSize)
		bulkItemSize = 0
	}
WORKER_DONE:
	// Final flush: include any document still staged in docBuf.
	if docBuf.Len() > 0 {
		mainBuf.Write(docBuf.Bytes())
		bulkItemSize++
	}
	c.TargetESAPI.Bulk(&mainBuf)
	log.Trace("bulk insert")
	pb.Add(bulkItemSize)
	bulkItemSize = 0
	wg.Done()
}
/*
Copyright 2016 Medcl (m AT medcl.net)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import "sync"
// Indexes maps an index name to its raw settings/mappings payload as
// exchanged with the Elasticsearch REST API.
type Indexes map[string]interface{}

// Document is one document's bulk-action metadata plus its source body.
type Document struct {
	Index string `json:"_index"`
	Type string `json:"_type"`
	Id string `json:"_id"`
	// NOTE(review): source is unexported, so encoding/json ignores it and the
	// `json:"_source"` tag has no effect; NewBulkWorker works around this by
	// encoding doc.source as a separate line. Renaming to Source would change
	// the marshaled action line — confirm before fixing.
	source map[string]interface{} `json:"_source"`
}
// Scroll mirrors the JSON body of an Elasticsearch scroll/search response:
// the scroll id used to fetch the next page, the hit documents, and shard
// bookkeeping including per-shard failures.
type Scroll struct {
	Took int `json:"took"`
	ScrollId string `json:"_scroll_id"`
	TimedOut bool `json:"timed_out"`
	// Hits carries the page of documents; Docs elements are the raw hit
	// objects (decoded as generic maps elsewhere).
	Hits struct {
		MaxScore float32 `json:"max_score"`
		Total int `json:"total"`
		Docs []interface{} `json:"hits"`
	} `json:"hits"`
	// Shards reports how many shards answered and details any failures.
	Shards struct {
		Total int `json:"total"`
		Successful int `json:"successful"`
		Failed int `json:"failed"`
		Failures []struct {
			Shard int `json:"shard"`
			Index string `json:"index"`
			Status int `json:"status"`
			Reason interface{} `json:"reason"`
		} `json:"failures"`
	} `json:"_shards"`
}
// ClusterVersion holds the node/cluster identity and version information
// returned by an Elasticsearch server (name, cluster name, version number,
// and bundled Lucene version).
type ClusterVersion struct{
	Name string `json:"name"`
	ClusterName string `json:"cluster_name"`
	Version struct {
		Number string `json:"number"`
		LuceneVersion string `json:"lucene_version"`
	} `json:"version"`
}

// ClusterHealth holds a cluster's name and its health status string.
type ClusterHealth struct {
	Name string `json:"cluster_name"`
	Status string `json:"status"`
}
// Migrator wires together the source and target cluster clients, their
// credentials, the parsed CLI configuration, and the channel through which
// scrolled documents flow from the reader to the bulk/dump workers.
type Migrator struct{
	FlushLock sync.Mutex
	// DocChan carries one decoded document (generic JSON map) per message;
	// closing it signals the workers to flush and exit.
	DocChan chan map[string]interface{}
	SourceESAPI ESAPI
	TargetESAPI ESAPI
	SourceAuth *Auth
	TargetAuth *Auth
	Config *Config
}
// Config is the full set of command-line options, parsed via go-flags struct
// tags (short/long flag names, per-field description and default).
type Config struct {
	// config options
	SourceEs string `short:"s" long:"source" description:"source elasticsearch instance, ie: http://localhost:9200"`
	Query string `short:"q" long:"query" description:"query against source elasticsearch instance, filter data before migrate, ie: name:medcl"`
	TargetEs string `short:"d" long:"dest" description:"destination elasticsearch instance, ie: http://localhost:9201"`
	SourceEsAuthStr string `short:"m" long:"source_auth" description:"basic auth of source elasticsearch instance, ie: user:pass"`
	TargetEsAuthStr string `short:"n" long:"dest_auth" description:"basic auth of target elasticsearch instance, ie: user:pass"`
	DocBufferCount int `short:"c" long:"count" description:"number of documents at a time: ie \"size\" in the scroll request" default:"10000"`
	Workers int `short:"w" long:"workers" description:"concurrency number for bulk workers" default:"1"`
	// BulkSizeInMB caps each bulk request body; see NewBulkWorker's flush check.
	BulkSizeInMB int `short:"b" long:"bulk_size" description:"bulk size in MB" default:"5"`
	ScrollTime string `short:"t" long:"time" description:"scroll time" default:"1m"`
	ScrollSliceSize int `long:"sliced_scroll_size" description:"size of sliced scroll, to make it work, the size should be > 1" default:"1"`
	RecreateIndex bool `short:"f" long:"force" description:"delete destination index before copying" default:"false"`
	CopyAllIndexes bool `short:"a" long:"all" description:"copy indexes starting with . and _"`
	CopyIndexSettings bool `long:"copy_settings" description:"copy index settings from source" default:"false"`
	CopyIndexMappings bool `long:"copy_mappings" description:"copy index mappings from source" default:"false"`
	ShardsCount int `long:"shards" description:"set a number of shards on newly created indexes"`
	SourceIndexNames string `short:"x" long:"src_indexes" description:"indexes name to copy,support regex and comma separated list" default:"_all"`
	TargetIndexName string `short:"y" long:"dest_index" description:"indexes name to save, allow only one indexname, original indexname will be used if not specified" default:""`
	WaitForGreen bool `long:"green" description:"wait for both hosts cluster status to be green before dump. otherwise yellow is okay"`
	LogLevel string `short:"v" long:"log" description:"setting log level,options:trace,debug,info,warn,error" default:"INFO"`
	DumpOutFile string `short:"o" long:"output_file" description:"output documents of source index into local file" `
	DumpInputFile string `short:"i" long:"input_file" description:"indexing from local dump file" `
	SourceProxy string `long:"source_proxy" description:"set proxy to source http connections, ie: http://127.0.0.1:8080"`
	TargetProxy string `long:"dest_proxy" description:"set proxy to target http connections, ie: http://127.0.0.1:8080"`
	Refresh bool `long:"refresh" description:"refresh after migration finished"`
	Fields string `long:"fields" description:"output fields, comma separated, ie: col1,col2,col3,..." `
}
// Auth holds the HTTP basic-auth credentials for one Elasticsearch cluster.
type Auth struct {
	User string
	Pass string
}
/*
Copyright 2016 Medcl (m AT medcl.net)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import "bytes"
// ESAPI abstracts the Elasticsearch REST operations the migrator needs, so
// different cluster versions can be driven through one interface.
type ESAPI interface{
	// ClusterHealth returns the cluster's name and health status.
	ClusterHealth() *ClusterHealth
	// Bulk submits an accumulated bulk-request body to the cluster.
	Bulk(data *bytes.Buffer)
	// GetIndexSettings fetches settings for the named index (or indexes).
	GetIndexSettings(indexNames string) (*Indexes, error)
	// DeleteIndex removes the named index.
	DeleteIndex(name string) (error)
	// CreateIndex creates an index with the given settings.
	CreateIndex(name string,settings map[string]interface{}) (error)
	// GetIndexMappings fetches mappings for the matching indexes.
	GetIndexMappings(copyAllIndexes bool,indexNames string)(string,int,*Indexes,error)
	// UpdateIndexSettings applies settings to an existing index.
	UpdateIndexSettings(indexName string,settings map[string]interface{})(error)
	// UpdateIndexMapping applies mappings to an existing index.
	UpdateIndexMapping(indexName string,mappings map[string]interface{})(error)
	// NewScroll opens a scroll over the named indexes and returns the first page.
	NewScroll(indexNames string,scrollTime string,docBufferCount int,query string, slicedId,maxSlicedCount int, fields string)(*Scroll, error)
	// NextScroll fetches the next page for an existing scroll id.
	NextScroll(scrollTime string,scrollId string)(*Scroll,error)
	// Refresh forces a refresh of the named index.
	Refresh(name string) (err error)
}
/*
Copyright 2016 Medcl (m AT medcl.net)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"sync"
"gopkg.in/cheggaaa/pb.v1"
log "github.com/cihub/seelog"
"os"
"bufio"
"encoding/json"
"io"
)
// checkFileIsExist reports whether filename exists on disk.
//
// Behavior matches the original: only a "does not exist" stat error yields
// false; any other stat error (e.g. permission denied) is treated as the
// file existing.
func checkFileIsExist(filename string) bool {
	_, err := os.Stat(filename)
	return !os.IsNotExist(err)
}
// NewFileReadWorker streams a previously dumped file (one JSON document per
// line, m.Config.DumpInputFile) into m.DocChan so bulk workers can index it.
// It closes m.DocChan when the file is exhausted and signals wg on every
// return path. pb is incremented once per successfully parsed line.
//
// Fixes vs. the original: wg.Done() is now deferred, so an open error no
// longer deadlocks callers waiting on wg; the duplicate defer f.Close() is
// removed; a final line lacking a trailing newline is processed instead of
// being dropped (ReadString returns the partial line together with io.EOF);
// and non-EOF read errors are logged rather than silently swallowed.
func (m *Migrator) NewFileReadWorker(pb *pb.ProgressBar, wg *sync.WaitGroup) {
	log.Debug("start reading file")
	defer wg.Done()

	f, err := os.Open(m.Config.DumpInputFile)
	if err != nil {
		log.Error(err)
		return
	}
	defer f.Close()

	r := bufio.NewReader(f)
	lineCount := 0
	for {
		line, rerr := r.ReadString('\n')
		// Process whatever was read BEFORE checking the error, so the last
		// line of a file without a trailing newline is not lost.
		if len(line) > 0 {
			lineCount++
			js := map[string]interface{}{}
			//log.Trace("reading file,",lineCount,",", line)
			if uerr := json.Unmarshal([]byte(line), &js); uerr != nil {
				log.Error(uerr)
			} else {
				m.DocChan <- js
				pb.Increment()
			}
		}
		if rerr != nil {
			if rerr != io.EOF {
				log.Error(rerr)
			}
			break
		}
	}

	log.Debug("end reading file")
	// Closing the channel tells downstream workers to flush and exit.
	close(m.DocChan)
}
// NewFileDumpWorker drains c.DocChan and appends each document as one JSON
// line to c.Config.DumpOutFile (creating the file if it does not exist),
// incrementing pb per document. It flushes, closes the file, and signals wg
// once the channel is closed.
func (c *Migrator) NewFileDumpWorker(pb *pb.ProgressBar, wg *sync.WaitGroup) {
	var f *os.File
	var err1 error;
	// Append to an existing dump file, otherwise create a fresh one.
	if checkFileIsExist(c.Config.DumpOutFile) {
		f, err1 = os.OpenFile(c.Config.DumpOutFile, os.O_APPEND|os.O_WRONLY, os.ModeAppend)
		if(err1!=nil){
			log.Error(err1)
			return
		}
	}else {
		f, err1 = os.Create(c.Config.DumpOutFile)
		if(err1!=nil){
			log.Error(err1)
			return
		}
	}
	// NOTE(review): the early returns above skip wg.Done(), which will
	// deadlock any caller waiting on wg — confirm and fix with a defer.
	w := bufio.NewWriter(f)
READ_DOCS:
	for {
		docI, open := <-c.DocChan
		// this check is in case the document is an error with scroll stuff
		if status, ok := docI["status"]; ok {
			if status.(int) == 404 {
				log.Error("error: ", docI["response"])
				continue
			}
		}
		// sanity check
		// NOTE(review): when the channel closes, docI is a nil map, so this
		// loop is the actual exit path (break READ_DOCS jumps past the loop
		// to WORKER_DONE); the `!open` check below only fires for complete
		// documents.
		for _, key := range []string{"_index", "_type", "_source", "_id"} {
			if _, ok := docI[key]; !ok {
				//json,_:=json.Marshal(docI)
				//log.Errorf("failed parsing document: %v", string(json))
				break READ_DOCS
			}
		}
		// Write the document as one JSON line (newline-delimited dump format).
		jsr,err:=json.Marshal(docI)
		log.Trace(string(jsr))
		if(err!=nil){
			log.Error(err)
		}
		n,err:=w.WriteString(string(jsr))
		if(err!=nil){
			log.Error(n,err)
		}
		w.WriteString("\n")
		pb.Increment()
		// if channel is closed flush and gtfo
		if !open {
			goto WORKER_DONE
		}
	}
WORKER_DONE:
	// Flush buffered output and release the file before signaling completion.
	w.Flush()
	f.Close()
	wg.Done()
	log.Debug("file dump finished")
}
module esm-0.4.0
go 1.20
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment