Commit 319d9d8b authored by yuhai's avatar yuhai
Browse files

Initial commit

parents
version: "3"
services:
fastai: &fastai
restart: unless-stopped
working_dir: /data
image: fastai/codespaces
logging:
driver: json-file
options:
max-size: 50m
stdin_open: true
tty: true
volumes:
- .:/data/
notebook:
<<: *fastai
command: bash -c "pip install -e . && jupyter notebook --allow-root --no-browser --ip=0.0.0.0 --port=8080 --NotebookApp.token='' --NotebookApp.password=''"
ports:
- "8080:8080"
watcher:
<<: *fastai
command: watchmedo shell-command --command nbdev_build_docs --pattern *.ipynb --recursive --drop
network_mode: host # for GitHub Codespaces https://github.com/features/codespaces/
jekyll:
<<: *fastai
ports:
- "4000:4000"
command: >
bash -c "pip install .
&& nbdev_build_docs && cd docs
&& bundle i
&& chmod -R u+rwx . && bundle exec jekyll serve --host 0.0.0.0"
source "https://rubygems.org"
gem "jekyll", ">= 3.7"
gem "jekyll-remote-theme"
#################################################
### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
#################################################
# Instead edit ../../sidebar.json
entries:
- folders:
- folderitems:
- output: web,pdf
title: Overview
url: /
- output: web,pdf
title: Iterative_masking
url: core.html
output: web
title: Iterative_masking
output: web
title: Sidebar
---
title: Iterative_masking
keywords: fastai
sidebar: home_sidebar
summary: "Use MSA Transformer to generate synthetic protein sequences by masking iteratively the same MSA."
description: "Use MSA Transformer to generate synthetic protein sequences by masking iteratively the same MSA."
nb_path: "00_core.ipynb"
---
<!--
#################################################
### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
#################################################
# file to edit: 00_core.ipynb
# command to build the docs after a change: nbdev_build_docs
-->
<div class="container" id="notebook-container">
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
</div>
{% endraw %}
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
</div>
{% endraw %}
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
<div class="output_wrapper">
<div class="output">
<div class="output_area">
<div class="output_markdown rendered_html output_subarea ">
<h2 id="IM_MSA_Transformer" class="doc_header"><code>class</code> <code>IM_MSA_Transformer</code><a href="" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>IM_MSA_Transformer</code>(<strong><code>iterations</code></strong>=<em><code>None</code></em>, <strong><code>p_mask</code></strong>=<em><code>None</code></em>, <strong><code>filename</code></strong>=<em><code>None</code></em>, <strong><code>num</code></strong>=<em><code>None</code></em>, <strong><code>filepath</code></strong>=<em><code>None</code></em>)</p>
</blockquote>
<p>Class that implements the Iterative masking algorithm</p>
</div>
</div>
</div>
</div>
</div>
{% endraw %}
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
<div class="output_wrapper">
<div class="output">
<div class="output_area">
<div class="output_markdown rendered_html output_subarea ">
<h4 id="IM_MSA_Transformer.Batch_MSA" class="doc_header"><code>IM_MSA_Transformer.Batch_MSA</code><a href="__main__.py#L303" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>IM_MSA_Transformer.Batch_MSA</code>(<strong><code>use_pdf</code></strong>=<em><code>False</code></em>, <strong><code>simplified</code></strong>=<em><code>False</code></em>, <strong><code>repetitions</code></strong>=<em><code>2</code></em>, <strong><code>sample_all</code></strong>=<em><code>False</code></em>, <strong><code>T</code></strong>=<em><code>1</code></em>, <strong><code>phylo</code></strong>=<em><code>False</code></em>)</p>
</blockquote>
<p>Generate a full MSA by calling with different input MSAs the iterative MSA generator defined
in: <code>self.NEW_MSA</code>.</p>
<p>---&gt; Use this function with <code>simplified</code>=False only if you need tokens in cuda ! (i.e. if you want to compute embeddings
or contacts), otherwise use <code>simplified</code>=True</p>
<p>The variable <code>self.iterations</code> must be a numpy array which specifies when (at which iterations)
the tokens must be saved. The last element of the array gives the maximum number of iterations that should be done.</p>
<p><code>repetitions</code>: the number of times self.NEW_MSA() is repeated with a different input MSA.</p>
<p><code>use_pdf</code>: if it's True the function samples the tokens from the logits pdf
instead of taking the argmax (greedy sampling).</p>
<p><code>sample_all</code>: if True all the new tokens are obtained from the logits (both
the masked and the non masked), if False the non masked tokens
are left untouched and only the masked ones are changed.</p>
<p><code>T</code>: Temperature of sampling from the pdf of output logits.</p>
<p><code>phylo</code>: if True the start sequences are sampled from phylogeny weights instead of randomly.</p>
</div>
</div>
</div>
</div>
</div>
{% endraw %}
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
<div class="output_wrapper">
<div class="output">
<div class="output_area">
<div class="output_markdown rendered_html output_subarea ">
<h4 id="IM_MSA_Transformer.Context_MSA" class="doc_header"><code>IM_MSA_Transformer.Context_MSA</code><a href="__main__.py#L448" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>IM_MSA_Transformer.Context_MSA</code>(<strong><code>depth</code></strong>=<em><code>None</code></em>, <strong><code>ancestor</code></strong>=<em><code>None</code></em>, <strong><code>context</code></strong>=<em><code>None</code></em>, <strong><code>use_pdf</code></strong>=<em><code>False</code></em>, <strong><code>simplified</code></strong>=<em><code>False</code></em>, <strong><code>sample_all</code></strong>=<em><code>False</code></em>, <strong><code>print_all</code></strong>=<em><code>True</code></em>, <strong><code>T</code></strong>=<em><code>1</code></em>)</p>
</blockquote>
<p>Generates a new MSA with context-generation by iterating the masking on the original ancestor sequence
using: <code>self.generate_MSA_context</code>. It masks <code>ancestor</code> (original sequence) and uses the sequences in <code>context</code> as context MSA.</p>
<p>---&gt; Use this function with <code>simplified</code>=False only if you need tokens in cuda ! (i.e. if you want to compute embeddings
or contacts), otherwise use <code>simplified</code>=True</p>
<p>The variable <code>self.iterations</code> must be a numpy array which specifies when (at which iterations)
the tokens must be saved. The last element of the array gives the maximum number of iterations that should be done.
If <code>print_all</code>=True then it saves the generated sequences at each iteration.</p>
<p><code>ancestor</code>: input sequence to be masked iteratively.</p>
<p><code>context</code>: context MSA (not masked).</p>
<p><code>use_pdf</code>: if it's True the function samples the tokens from the logits pdf
instead of taking the argmax (greedy sampling).</p>
<p><code>sample_all</code>: if True all the new tokens are obtained from the logits (both
the masked and the non masked), if False the non masked tokens
are left untouched and only the masked ones are changed.</p>
<p><code>T</code>: Temperature of sampling from the pdf of output logits.</p>
<p><code>depth</code>: number of generated sequences, if None the depth is the number of ancestor sequences.</p>
</div>
</div>
</div>
</div>
</div>
{% endraw %}
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
</div>
{% endraw %}
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
<div class="output_wrapper">
<div class="output">
<div class="output_area">
<div class="output_markdown rendered_html output_subarea ">
<h4 id="gen_MSAs" class="doc_header"><code>gen_MSAs</code><a href="__main__.py#L6" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>gen_MSAs</code>(<strong><code>filepath</code></strong>:"Path of the input directory", <strong><code>filename</code></strong>:"Name of the input file(s)", <strong><code>new_dir</code></strong>:"Name of the output directory", <strong><code>pdf</code></strong>:"Should I sample tokens from the pdf ? (bool)", <strong><code>T</code></strong>:"Which is the sampling Temperature from the pdf ? (only when <code>pdf</code> is True)", <strong><code>sample_all</code></strong>:"Should I sample all tokens or just the masked ones ? (True = sample all tokens)", <strong><code>Iters</code></strong>:"Number of total iterations to generate the new tokens", <strong><code>pmask</code></strong>:"Masking probability", <strong><code>num</code></strong>:"Size of the batches MSAs which the MSA-Transformer receives as input", <strong><code>depth</code></strong>:"Number of batches (of size num) that you want to generate", <strong><code>generate</code></strong>:"How should I generate sequences ? False (=Batch generation) or Linear with context (=linear-ran/linear-tot-ran), <code>-ran</code> means that the context MSA is sampled randomly (once) while <code>-tot-ran</code> means that it is sampled randomly each time.", <strong><code>print_all</code></strong>:"Should I print the MSA after each iteration ? (bool)", <strong><code>range_vals</code></strong>:"First and last index of the sequences that you want to use as ancestors", <strong><code>phylo_w</code></strong>:"Should I sample the starting sequences from the phylogeny weights ? (bool)")</p>
</blockquote>
<p>Generate a new MSA either with Batch generation or with Context generation. It shuffles the initial MSA and uses different slices as batch MSAs</p>
</div>
</div>
</div>
</div>
</div>
{% endraw %}
<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h2 id="Build-library">Build library<a class="anchor-link" href="#Build-library"> </a></h2>
</div>
</div>
</div>
</div>
---
search: exclude
layout: none
---
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>{{ site.title | xml_escape }}</title>
<description>{{ site.description | xml_escape }}</description>
<link>{{ site.url }}/</link>
<atom:link href="{{ "/feed.xml" | prepend: site.url }}" rel="self" type="application/rss+xml"/>
<pubDate>{{ site.time | date_to_rfc822 }}</pubDate>
<lastBuildDate>{{ site.time | date_to_rfc822 }}</lastBuildDate>
<generator>Jekyll v{{ jekyll.version }}</generator>
{% for post in site.posts limit:10 %}
<item>
<title>{{ post.title | xml_escape }}</title>
<description>{{ post.content | xml_escape }}</description>
<pubDate>{{ post.date | date_to_rfc822 }}</pubDate>
<link>{{ post.url | prepend: site.url }}</link>
<guid isPermaLink="true">{{ post.url | prepend: site.url }}</guid>
{% for tag in post.tags %}
<category>{{ tag | xml_escape }}</category>
{% endfor %}
{% for tag in page.tags %}
<category>{{ tag | xml_escape }}</category>
{% endfor %}
</item>
{% endfor %}
</channel>
</rss>
---
title: Iterative_masking
keywords: fastai
sidebar: home_sidebar
summary: "Supporting repository for: \"Generative power of a protein language model trained on multiple sequence alignments\" (preprint: https://doi.org/10.1101/2022.04.14.488405). We use MSA Transformer (https://doi.org/10.1101/2021.02.12.430858) to generate synthetic protein sequences by masking iteratively the same MSA."
description: "Supporting repository for: \"Generative power of a protein language model trained on multiple sequence alignments\" (preprint: https://doi.org/10.1101/2022.04.14.488405). We use MSA Transformer (https://doi.org/10.1101/2021.02.12.430858) to generate synthetic protein sequences by masking iteratively the same MSA."
nb_path: "index.ipynb"
---
<!--
#################################################
### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
#################################################
# file to edit: index.ipynb
# command to build the docs after a change: nbdev_build_docs
-->
<div class="container" id="notebook-container">
{% raw %}
<div class="cell border-box-sizing code_cell rendered">
</div>
{% endraw %}
<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h2 id="Getting-started">Getting started<a class="anchor-link" href="#Getting-started"> </a></h2><p>Clone this repository on your local machine by running:</p>
<div class="highlight"><pre><span></span>git clone git@github.com:Bitbol-Lab/Iterative_masking.git
</pre></div>
<p>and move inside the root folder.
One can then use the functions directly from the cloned repository (in the folder <code>Iterative_masking</code>) or install it with an editable install by running:</p>
<div class="highlight"><pre><span></span>pip install -e .
</pre></div>
<p>We recommend creating and activating a dedicated <code>conda</code> or <code>virtualenv</code> Python virtual environment.</p>
<h2 id="Requirements">Requirements<a class="anchor-link" href="#Requirements"> </a></h2><p>In order to use the functions, the following python packages are required:</p>
<ul>
<li>numpy</li>
<li>scipy</li>
<li>numba</li>
<li>fastcore</li>
<li>biopython</li>
<li>esm==0.4.0</li>
<li>pytorch</li>
</ul>
<p>It is also required to use a GPU (with cuda).</p>
</div>
</div>
</div>
<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h2 id="How-to-use">How to use<a class="anchor-link" href="#How-to-use"> </a></h2>
</div>
</div>
</div>
<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<p><a href="/Iterative_masking/core.html#IM_MSA_Transformer"><code>IM_MSA_Transformer</code></a>: Class with different functions used to generate new MSAs with the iterative masking procedure</p>
<p><a href="/Iterative_masking/core.html#gen_MSAs"><code>gen_MSAs</code></a>: example function (with parser) that can be used to generate and save new sequences directly from the terminal.</p>
</div>
</div>
</div>
</div>
{
"Iterative_masking": {
"Overview": "/",
"Iterative_masking": "core.html"
}
}
\ No newline at end of file
---
layout: none
search: exclude
---
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{% for post in site.posts %}
{% unless post.search == "exclude" %}
<url>
<loc>{{site.url}}{{post.url}}</loc>
</url>
{% endunless %}
{% endfor %}
{% for page in site.pages %}
{% unless page.search == "exclude" %}
<url>
<loc>{{site.url}}{{ page.url}}</loc>
</url>
{% endunless %}
{% endfor %}
</urlset>
\ No newline at end of file
# Created by .ignore support plugin (hsz.mobi)
### Go template
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof
.idea
/bin/
/log/
/pkg/
*.iml
/src
language: go
os:
- osx
go:
- 1.10
install:
- go get github.com/mitchellh/gox
script: make cross-gox-build-all-platform package-all-platform
deploy:
provider: releases
skip_cleanup: true
api_key:
secure: E7POmQPK3yM4X3XI/WBSc9zFQ5hc7DuDr3ECOnXdkEbq5Uec0WcV9EZHmiMhPwd/dNKQ5/fVGdP+R9RsMtllMG29r6WWVTOdoZP/upWUn2rH7ZzpUuTCo2rXgKjFSw4iu8qzs0Rg7kMcvepKp8kvN1qcujri8sRz3K8ntHz3krEE2+aoNZQZBKHrRoKoQyBaouVQjVGIYI+Pafyyco0zByt1ijorWK04aCCx9xwoRLw6CS1RJVtk98I5EPDhOr0/madYYkVIQj5tpqv7BIUzT+cVLXfXWh9D2XrUuv1ui/1FvHXH/wuxkplWhMAGQaDRF9O86HtAHN+8EC/0VchTNQbVcBScw1gu3R4wPhU7xyfZQBa86TgV3CdfJGiSDkqIY1D7M+XF8f8XOJ6yaKA5Vjh83B3u7aeCwnLRf9appJsxY5YECqBua3sQJnxLT9wv8ObRd+7Yb8itXW6CN8iqcwnS2oIvKJPpJ99fcE4DPrPprONPiQiBse8378f8p6/YR0K+S3ai4XOSBO/qnZLHsKzVerwhIabMEjgpV7t0F+LegsmvToHkhgK0ArMyh9IHWgfwFLq1C/MOJGF5FLS09TrOmuNEeYrHheTrd2KLezUKX5ujWp0H8Rddks62baIJkYHYbBPYEQLWcmqG4kI6kAYBJC+5LGgoNLJUye/Mnyo=
file: 'bin/*.tar.gz'
file_glob: true
on:
repo: medcl/elasticsearch-migration
tags: true
SHELL=/bin/bash
CWD=$(shell pwd)
OLDGOPATH=${GOPATH}
NEWGOPATH:=${CWD}:${OLDGOPATH}
export GOPATH=$(NEWGOPATH)
PATH := $(PATH):$(GOPATH)/bin
build: clean config
go build -o bin/esm
tar: build
tar cfz bin/esm.tar.gz bin/esm
cross-build-all-platform: clean config
go test
GOOS=windows GOARCH=amd64 go build -o bin/windows64/esm.exe
GOOS=windows GOARCH=386 go build -o bin/windows32/esm.exe
GOOS=darwin GOARCH=amd64 go build -o bin/darwin64/esm
GOOS=darwin GOARCH=386 go build -o bin/darwin32/esm
GOOS=linux GOARCH=amd64 go build -o bin/linux64/esm
GOOS=linux GOARCH=386 go build -o bin/linux32/esm
GOOS=linux GOARCH=arm go build -o bin/linux_arm/esm
GOOS=freebsd GOARCH=amd64 go build -o bin/freebsd64/esm
GOOS=freebsd GOARCH=386 go build -o bin/freebsd32/esm
GOOS=netbsd GOARCH=amd64 go build -o bin/netbsd64/esm
GOOS=netbsd GOARCH=386 go build -o bin/netbsd32/esm
GOOS=openbsd GOARCH=amd64 go build -o bin/openbsd64/esm
GOOS=openbsd GOARCH=386 go build -o bin/openbsd32/esm
gox-cross-build-all-platform: clean config
go get github.com/mitchellh/gox
go test
gox -output="bin/esm_{{.OS}}_{{.Arch}}"
cross-gox-build-all-platform: clean config
go get github.com/mitchellh/gox
go test
gox -os=windows -arch=amd64 -output="bin/windows64/esm"
gox -os=windows -arch=386 -output=bin/windows32/esm
gox -os=darwin -arch=amd64 -output=bin/darwin64/esm
gox -os=darwin -arch=386 -output=bin/darwin32/esm
gox -os=linux -arch=amd64 -output=bin/linux64/esm
gox -os=linux -arch=386 -output=bin/linux32/esm
gox -os=linux -arch=arm -output=bin/linux_arm/esm
gox -os=freebsd -arch=amd64 -output=bin/freebsd64/esm
gox -os=freebsd -arch=386 -output=bin/freebsd32/esm
gox -os=netbsd -arch=amd64 -output=bin/netbsd64/esm
gox -os=netbsd -arch=386 -output=bin/netbsd32/esm
gox -os=openbsd -arch=amd64 -output=bin/openbsd64/esm
gox -os=openbsd -arch=386 -output=bin/openbsd32/esm
cross-build: clean config
go test
GOOS=windows GOARCH=amd64 go build -o bin/windows64/esm.exe
GOOS=darwin GOARCH=amd64 go build -o bin/darwin64/esm
GOOS=linux GOARCH=amd64 go build -o bin/linux64/esm
all: clean config cross-build
all-platform: clean config cross-build-all-platform
format:
gofmt -s -w -tabs=false -tabwidth=4 main.go
clean:
rm -rif bin
mkdir bin
config:
@echo "get Dependencies"
go env
go get gopkg.in/cheggaaa/pb.v1
go get github.com/jessevdk/go-flags
go get github.com/olekukonko/ts
go get github.com/cihub/seelog
go get github.com/parnurzeal/gorequest
dist: cross-build package
dist-all: all package
dist-all-platform: all-platform package-all-platform
package:
@echo "Packaging"
tar cfz bin/windows64.tar.gz bin/windows64/esm.exe
tar cfz bin/darwin64.tar.gz bin/darwin64/esm
tar cfz bin/linux64.tar.gz bin/linux64/esm
package-all-platform:
@echo "Packaging"
tar cfz bin/windows64.tar.gz bin/windows64/esm.exe
tar cfz bin/windows32.tar.gz bin/windows32/esm.exe
tar cfz bin/darwin64.tar.gz bin/darwin64/esm
tar cfz bin/darwin32.tar.gz bin/darwin32/esm
tar cfz bin/linux64.tar.gz bin/linux64/esm
tar cfz bin/linux32.tar.gz bin/linux32/esm
tar cfz bin/linux_arm.tar.gz bin/linux_arm/esm
tar cfz bin/freebsd64.tar.gz bin/freebsd64/esm
tar cfz bin/freebsd32.tar.gz bin/freebsd32/esm
tar cfz bin/netbsd64.tar.gz bin/netbsd64/esm
tar cfz bin/netbsd32.tar.gz bin/netbsd32/esm
tar cfz bin/openbsd64.tar.gz bin/openbsd64/esm
tar cfz bin/openbsd32.tar.gz bin/openbsd32/esm
cross-compile:
@echo "Prepare Cross Compiling"
cd $(GOROOT)/src && GOOS=windows GOARCH=amd64 ./make.bash --no-clean
cd $(GOROOT)/src && GOOS=darwin GOARCH=amd64 ./make.bash --no-clean 2> /dev/null 1> /dev/null
cd $(GOROOT)/src && GOOS=linux GOARCH=amd64 ./make.bash --no-clean 2> /dev/null 1> /dev/null
cd $(CWD)
# An Elasticsearch Migration Tool
Support cross version and http basic auth.
[![asciicast](https://asciinema.org/a/e562wy1ro30yboznkj5f539md.png)](https://asciinema.org/a/e562wy1ro30yboznkj5f539md)
## Features:
* Cross version migration supported
* Overwrite index name
* Copy index settings and mapping
* Support http basic auth
* Support dump into local file
* Support loading from local file
* Support http proxy
* Support sliced scroll (only for elasticsearch 5.0)
## Example:
copy index `index_name` from `192.168.1.x` to `192.168.1.y:9200`
```
./bin/esm -s http://192.168.1.x:9200 -d http://192.168.1.y:9200 -x index_name -w=5 -b=10 -c 10000
```
copy index `src_index` from `192.168.1.x` to `192.168.1.y:9200` and save with `dest_index`
```
./bin/esm -s http://localhost:9200 -d http://localhost:9200 -x src_index -y dest_index -w=5 -b=100
```
support Basic-Auth
```
./bin/esm -s http://localhost:9200 -x "src_index" -y "dest_index" -d http://localhost:9201 -n admin:111111
```
copy settings and override shard size
```
./bin/esm -s http://localhost:9200 -x "src_index" -y "dest_index" -d http://localhost:9201 -m admin:111111 -c 10000 --shards=50 --copy_settings
```
copy settings and mapping, recreate target index, add query to source fetch, refresh after migration
```
./bin/esm -s http://localhost:9200 -x "src_index" -q=query:phone -y "dest_index" -d http://localhost:9201 -c 10000 --shards=5 --copy_settings --copy_mapping --force --refresh
```
dump elasticsearch documents into local file
```
./bin/esm -s http://localhost:9200 -x "src_index" -m admin:111111 -c 5000 -b -q=query:mixer --refresh -o=dump.bin
```
loading data from dump files, bulk insert to another es instance
```
./bin/esm -d http://localhost:9200 -y "dest_index" -n admin:111111 -c 5000 -b 5 --refresh -i=dump.bin
```
support proxy
```
./bin/esm -d http://123345.ap-northeast-1.aws.found.io:9200 -y "dest_index" -n admin:111111 -c 5000 -b 1 --refresh -i dump.bin --dest_proxy=http://127.0.0.1:9743
```
use sliced scroll(only available in elasticsearch v5) to speed scroll, and update shard number
```
./bin/esm -s=http://192.168.3.206:9200 -d=http://localhost:9200 -n=elastic:changeme -f --copy_settings --copy_mappings -x=bestbuykaggle --sliced_scroll_size=5 --shards=50 --refresh
```
## Download
https://github.com/medcl/elasticsearch-dump/releases
## Compile:
If the downloaded binaries do not fit your environment, you may try to compile it yourself. `go` is required.
`make build`
## Options
```
-s, --source= source elasticsearch instance
-d, --dest= destination elasticsearch instance
-q, --query= query against source elasticsearch instance, filter data before migrate, ie: name:medcl
-m, --source_auth basic auth of source elasticsearch instance, ie: user:pass
-n, --dest_auth basic auth of target elasticsearch instance, ie: user:pass
-c, --count= number of documents at a time: ie "size" in the scroll request (10000)
--sliced_scroll_size= size of sliced scroll, to make it work, the size should be > 1, default:"1"
-t, --time= scroll time (1m)
--shards= set a number of shards on newly created indexes
--copy_settings copy index settings from source
--copy_mappings copy index mappings from source
-f, --force delete destination index before copying, default:false
-x, --src_indexes= list of indexes to copy, comma separated (_all), support wildcard match(*)
-y, --dest_index= indexes name to save, allow only one indexname, original indexname will be used if not specified
-a, --all copy indexes starting with . and _ (false)
-w, --workers= concurrency number for bulk workers, default is: "1"
-b --bulk_size bulk size in MB" default:5
-v --log setting log level,options:trace,debug,info,warn,error
-i --input_file indexing from local dump file, file format: {"_id":"xxx","_index":"xxx","_source":{"xxx":"xxx"},"_type":"xxx" }
-o --output_file output documents of source index into local file, file format same as input_file.
--source_proxy set proxy to source http connections, ie: http://127.0.0.1:8080
--dest_proxy set proxy to destination http connections, ie: http://127.0.0.1:8080
--refresh refresh after migration finished
```
Versions
--------
From | To
-----------|-----------
1.x | 1.x
1.x | 2.x
1.x | 5.0
2.x | 1.x
2.x | 2.x
2.x | 5.0
5.0 | 1.x
5.0 | 2.x
5.0 | 5.0
/*
Copyright 2016 Medcl (m AT medcl.net)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"sync"
log "github.com/cihub/seelog"
"encoding/json"
"bytes"
"gopkg.in/cheggaaa/pb.v1"
"time"
)
// NewBulkWorker drains c.DocChan, batches documents into an Elasticsearch
// bulk-request body (one "create" action line plus one source line per
// document), and flushes the batch to the target cluster whenever it
// approaches the configured size limit or input goes quiet for 5 seconds.
// It increments *docCount per buffered document, adds flushed item counts
// to pb, and calls wg.Done() on exit.
func (c *Migrator) NewBulkWorker(docCount *int, pb *pb.ProgressBar, wg *sync.WaitGroup) {
	log.Debug("start es bulk worker")
	bulkItemSize := 0
	// mainBuf accumulates the full bulk body; docBuf stages one document
	// (action + source) so a partial encode never corrupts mainBuf.
	mainBuf := bytes.Buffer{}
	docBuf := bytes.Buffer{}
	docEnc := json.NewEncoder(&docBuf)
READ_DOCS:
	for {
		select {
		case docI, open := <-c.DocChan:
			var err error
			log.Trace("read doc from channel,", docI)
			// this check is in case the document is an error with scroll stuff
			if status, ok := docI["status"]; ok {
				if status.(int) == 404 {
					log.Error("error: ", docI["response"])
					continue
				}
			}
			// sanity check
			// NOTE(review): when the channel is closed, docI is a nil map, so
			// this loop is what actually exits (via `break READ_DOCS`, jumping
			// past the loop to WORKER_DONE); the `!open` check further below is
			// only reached for documents that carry all four keys.
			for _, key := range []string{"_index", "_type", "_source", "_id"} {
				if _, ok := docI[key]; !ok {
					//json,_:=json.Marshal(docI)
					//log.Errorf("failed parsing document: %v", string(json))
					break READ_DOCS
				}
			}
			// Redirect the document to the override index if one was given.
			var tempDestIndexName string
			tempDestIndexName = docI["_index"].(string)
			if c.Config.TargetIndexName != "" {
				tempDestIndexName = c.Config.TargetIndexName
			}
			doc := Document{
				Index: tempDestIndexName,
				Type: docI["_type"].(string),
				source: docI["_source"].(map[string]interface{}),
				Id: docI["_id"].(string),
			}
			// if channel is closed flush and gtfo
			if !open {
				goto WORKER_DONE
			}
			// sanity check
			if len(doc.Index) == 0 || len(doc.Id) == 0 || len(doc.Type) == 0 {
				log.Errorf("failed decoding document: %+v", doc)
				continue
			}
			// encode the doc and the _source field for a bulk request
			post := map[string]Document{
				"create": doc,
			}
			if err = docEnc.Encode(post); err != nil {
				log.Error(err)
			}
			if err = docEnc.Encode(doc.source); err != nil {
				log.Error(err)
			}
			// if we approach the 100mb es limit, flush to es and reset mainBuf
			if mainBuf.Len() + docBuf.Len() > (c.Config.BulkSizeInMB * 1000000) {
				goto CLEAN_BUFFER
			}
			// append the doc to the main buffer
			mainBuf.Write(docBuf.Bytes())
			// reset for next document
			bulkItemSize++
			docBuf.Reset()
			(*docCount)++
		case <-time.After(time.Second * 5):
			// Idle flush: push whatever is buffered after 5s of silence.
			log.Debug("5s no message input")
			goto CLEAN_BUFFER
		case <-time.After(time.Minute * 5):
			// NOTE(review): this case is unreachable — the 5-second timer in
			// the same select always fires first, so the worker is never shut
			// down by this branch. Confirm intent before removing.
			log.Warn("5m no message input, close worker")
			goto WORKER_DONE
		}
		goto READ_DOCS
	CLEAN_BUFFER:
		// Flush the accumulated bulk body and reset the per-flush item count.
		c.TargetESAPI.Bulk(&mainBuf)
		log.Trace("clean buffer, and execute bulk insert")
		pb.Add(bulkItemSize)
		bulkItemSize = 0
	}
WORKER_DONE:
	// Final flush: include any document still staged in docBuf.
	if docBuf.Len() > 0 {
		mainBuf.Write(docBuf.Bytes())
		bulkItemSize++
	}
	c.TargetESAPI.Bulk(&mainBuf)
	log.Trace("bulk insert")
	pb.Add(bulkItemSize)
	bulkItemSize = 0
	wg.Done()
}
/*
Copyright 2016 Medcl (m AT medcl.net)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import "sync"
// Indexes maps an index name to its raw settings/mappings payload as
// exchanged with the Elasticsearch REST API.
type Indexes map[string]interface{}

// Document is one document's bulk-action metadata plus its source body.
type Document struct {
	Index string `json:"_index"`
	Type string `json:"_type"`
	Id string `json:"_id"`
	// NOTE(review): source is unexported, so encoding/json ignores it and the
	// `json:"_source"` tag has no effect; NewBulkWorker works around this by
	// encoding doc.source as a separate line. Renaming to Source would change
	// the marshaled action line — confirm before fixing.
	source map[string]interface{} `json:"_source"`
}
// Scroll mirrors the JSON body of an Elasticsearch scroll/search response:
// the scroll id used to fetch the next page, the hit documents, and shard
// bookkeeping including per-shard failures.
type Scroll struct {
	Took int `json:"took"`
	ScrollId string `json:"_scroll_id"`
	TimedOut bool `json:"timed_out"`
	// Hits carries the page of documents; Docs elements are the raw hit
	// objects (decoded as generic maps elsewhere).
	Hits struct {
		MaxScore float32 `json:"max_score"`
		Total int `json:"total"`
		Docs []interface{} `json:"hits"`
	} `json:"hits"`
	// Shards reports how many shards answered and details any failures.
	Shards struct {
		Total int `json:"total"`
		Successful int `json:"successful"`
		Failed int `json:"failed"`
		Failures []struct {
			Shard int `json:"shard"`
			Index string `json:"index"`
			Status int `json:"status"`
			Reason interface{} `json:"reason"`
		} `json:"failures"`
	} `json:"_shards"`
}
// ClusterVersion holds the node/cluster identity and version information
// returned by an Elasticsearch server (name, cluster name, version number,
// and bundled Lucene version).
type ClusterVersion struct{
	Name string `json:"name"`
	ClusterName string `json:"cluster_name"`
	Version struct {
		Number string `json:"number"`
		LuceneVersion string `json:"lucene_version"`
	} `json:"version"`
}

// ClusterHealth holds a cluster's name and its health status string.
type ClusterHealth struct {
	Name string `json:"cluster_name"`
	Status string `json:"status"`
}
// Migrator wires together the source and target cluster clients, their
// credentials, the parsed CLI configuration, and the channel through which
// scrolled documents flow from the reader to the bulk/dump workers.
type Migrator struct{
	FlushLock sync.Mutex
	// DocChan carries one decoded document (generic JSON map) per message;
	// closing it signals the workers to flush and exit.
	DocChan chan map[string]interface{}
	SourceESAPI ESAPI
	TargetESAPI ESAPI
	SourceAuth *Auth
	TargetAuth *Auth
	Config *Config
}
// Config is the full set of command-line options, parsed via go-flags struct
// tags (short/long flag names, per-field description and default).
type Config struct {
	// config options
	SourceEs string `short:"s" long:"source" description:"source elasticsearch instance, ie: http://localhost:9200"`
	Query string `short:"q" long:"query" description:"query against source elasticsearch instance, filter data before migrate, ie: name:medcl"`
	TargetEs string `short:"d" long:"dest" description:"destination elasticsearch instance, ie: http://localhost:9201"`
	SourceEsAuthStr string `short:"m" long:"source_auth" description:"basic auth of source elasticsearch instance, ie: user:pass"`
	TargetEsAuthStr string `short:"n" long:"dest_auth" description:"basic auth of target elasticsearch instance, ie: user:pass"`
	DocBufferCount int `short:"c" long:"count" description:"number of documents at a time: ie \"size\" in the scroll request" default:"10000"`
	Workers int `short:"w" long:"workers" description:"concurrency number for bulk workers" default:"1"`
	// BulkSizeInMB caps each bulk request body; see NewBulkWorker's flush check.
	BulkSizeInMB int `short:"b" long:"bulk_size" description:"bulk size in MB" default:"5"`
	ScrollTime string `short:"t" long:"time" description:"scroll time" default:"1m"`
	ScrollSliceSize int `long:"sliced_scroll_size" description:"size of sliced scroll, to make it work, the size should be > 1" default:"1"`
	RecreateIndex bool `short:"f" long:"force" description:"delete destination index before copying" default:"false"`
	CopyAllIndexes bool `short:"a" long:"all" description:"copy indexes starting with . and _"`
	CopyIndexSettings bool `long:"copy_settings" description:"copy index settings from source" default:"false"`
	CopyIndexMappings bool `long:"copy_mappings" description:"copy index mappings from source" default:"false"`
	ShardsCount int `long:"shards" description:"set a number of shards on newly created indexes"`
	SourceIndexNames string `short:"x" long:"src_indexes" description:"indexes name to copy,support regex and comma separated list" default:"_all"`
	TargetIndexName string `short:"y" long:"dest_index" description:"indexes name to save, allow only one indexname, original indexname will be used if not specified" default:""`
	WaitForGreen bool `long:"green" description:"wait for both hosts cluster status to be green before dump. otherwise yellow is okay"`
	LogLevel string `short:"v" long:"log" description:"setting log level,options:trace,debug,info,warn,error" default:"INFO"`
	DumpOutFile string `short:"o" long:"output_file" description:"output documents of source index into local file" `
	DumpInputFile string `short:"i" long:"input_file" description:"indexing from local dump file" `
	SourceProxy string `long:"source_proxy" description:"set proxy to source http connections, ie: http://127.0.0.1:8080"`
	TargetProxy string `long:"dest_proxy" description:"set proxy to target http connections, ie: http://127.0.0.1:8080"`
	Refresh bool `long:"refresh" description:"refresh after migration finished"`
	Fields string `long:"fields" description:"output fields, comma separated, ie: col1,col2,col3,..." `
}
// Auth holds the HTTP basic-auth credentials for one Elasticsearch cluster.
type Auth struct {
	User string
	Pass string
}
/*
Copyright 2016 Medcl (m AT medcl.net)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import "bytes"
// ESAPI abstracts the Elasticsearch REST operations the migrator needs, so
// different cluster versions can be driven through one interface.
type ESAPI interface{
	// ClusterHealth returns the cluster's name and health status.
	ClusterHealth() *ClusterHealth
	// Bulk submits an accumulated bulk-request body to the cluster.
	Bulk(data *bytes.Buffer)
	// GetIndexSettings fetches settings for the named index (or indexes).
	GetIndexSettings(indexNames string) (*Indexes, error)
	// DeleteIndex removes the named index.
	DeleteIndex(name string) (error)
	// CreateIndex creates an index with the given settings.
	CreateIndex(name string,settings map[string]interface{}) (error)
	// GetIndexMappings fetches mappings for the matching indexes.
	GetIndexMappings(copyAllIndexes bool,indexNames string)(string,int,*Indexes,error)
	// UpdateIndexSettings applies settings to an existing index.
	UpdateIndexSettings(indexName string,settings map[string]interface{})(error)
	// UpdateIndexMapping applies mappings to an existing index.
	UpdateIndexMapping(indexName string,mappings map[string]interface{})(error)
	// NewScroll opens a scroll over the named indexes and returns the first page.
	NewScroll(indexNames string,scrollTime string,docBufferCount int,query string, slicedId,maxSlicedCount int, fields string)(*Scroll, error)
	// NextScroll fetches the next page for an existing scroll id.
	NextScroll(scrollTime string,scrollId string)(*Scroll,error)
	// Refresh forces a refresh of the named index.
	Refresh(name string) (err error)
}
/*
Copyright 2016 Medcl (m AT medcl.net)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"sync"
"gopkg.in/cheggaaa/pb.v1"
log "github.com/cihub/seelog"
"os"
"bufio"
"encoding/json"
"io"
)
// checkFileIsExist reports whether filename exists on disk.
//
// Behavior matches the original: only a "does not exist" stat error yields
// false; any other stat error (e.g. permission denied) is treated as the
// file existing.
func checkFileIsExist(filename string) bool {
	_, err := os.Stat(filename)
	return !os.IsNotExist(err)
}
// NewFileReadWorker streams a previously dumped file (one JSON document per
// line, m.Config.DumpInputFile) into m.DocChan so bulk workers can index it.
// It closes m.DocChan when the file is exhausted and signals wg on every
// return path. pb is incremented once per successfully parsed line.
//
// Fixes vs. the original: wg.Done() is now deferred, so an open error no
// longer deadlocks callers waiting on wg; the duplicate defer f.Close() is
// removed; a final line lacking a trailing newline is processed instead of
// being dropped (ReadString returns the partial line together with io.EOF);
// and non-EOF read errors are logged rather than silently swallowed.
func (m *Migrator) NewFileReadWorker(pb *pb.ProgressBar, wg *sync.WaitGroup) {
	log.Debug("start reading file")
	defer wg.Done()

	f, err := os.Open(m.Config.DumpInputFile)
	if err != nil {
		log.Error(err)
		return
	}
	defer f.Close()

	r := bufio.NewReader(f)
	lineCount := 0
	for {
		line, rerr := r.ReadString('\n')
		// Process whatever was read BEFORE checking the error, so the last
		// line of a file without a trailing newline is not lost.
		if len(line) > 0 {
			lineCount++
			js := map[string]interface{}{}
			//log.Trace("reading file,",lineCount,",", line)
			if uerr := json.Unmarshal([]byte(line), &js); uerr != nil {
				log.Error(uerr)
			} else {
				m.DocChan <- js
				pb.Increment()
			}
		}
		if rerr != nil {
			if rerr != io.EOF {
				log.Error(rerr)
			}
			break
		}
	}

	log.Debug("end reading file")
	// Closing the channel tells downstream workers to flush and exit.
	close(m.DocChan)
}
// NewFileDumpWorker drains c.DocChan and appends each document as one JSON
// line to c.Config.DumpOutFile (creating the file if it does not exist),
// incrementing pb per document. It flushes, closes the file, and signals wg
// once the channel is closed.
func (c *Migrator) NewFileDumpWorker(pb *pb.ProgressBar, wg *sync.WaitGroup) {
	var f *os.File
	var err1 error;
	// Append to an existing dump file, otherwise create a fresh one.
	if checkFileIsExist(c.Config.DumpOutFile) {
		f, err1 = os.OpenFile(c.Config.DumpOutFile, os.O_APPEND|os.O_WRONLY, os.ModeAppend)
		if(err1!=nil){
			log.Error(err1)
			return
		}
	}else {
		f, err1 = os.Create(c.Config.DumpOutFile)
		if(err1!=nil){
			log.Error(err1)
			return
		}
	}
	// NOTE(review): the early returns above skip wg.Done(), which will
	// deadlock any caller waiting on wg — confirm and fix with a defer.
	w := bufio.NewWriter(f)
READ_DOCS:
	for {
		docI, open := <-c.DocChan
		// this check is in case the document is an error with scroll stuff
		if status, ok := docI["status"]; ok {
			if status.(int) == 404 {
				log.Error("error: ", docI["response"])
				continue
			}
		}
		// sanity check
		// NOTE(review): when the channel closes, docI is a nil map, so this
		// loop is the actual exit path (break READ_DOCS jumps past the loop
		// to WORKER_DONE); the `!open` check below only fires for complete
		// documents.
		for _, key := range []string{"_index", "_type", "_source", "_id"} {
			if _, ok := docI[key]; !ok {
				//json,_:=json.Marshal(docI)
				//log.Errorf("failed parsing document: %v", string(json))
				break READ_DOCS
			}
		}
		// Write the document as one JSON line (newline-delimited dump format).
		jsr,err:=json.Marshal(docI)
		log.Trace(string(jsr))
		if(err!=nil){
			log.Error(err)
		}
		n,err:=w.WriteString(string(jsr))
		if(err!=nil){
			log.Error(n,err)
		}
		w.WriteString("\n")
		pb.Increment()
		// if channel is closed flush and gtfo
		if !open {
			goto WORKER_DONE
		}
	}
WORKER_DONE:
	// Flush buffered output and release the file before signaling completion.
	w.Flush()
	f.Close()
	wg.Done()
	log.Debug("file dump finished")
}
module esm-0.4.0
go 1.20
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment