README.md 1.43 KB
Newer Older
wangsen's avatar
wangsen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92

# 数据集






# 环境搭建
```
git clone    https://github.com/HazyResearch/hyena-dna.git  && cd hyena-dna
pip install -r requirements.txt  -i https://pypi.tuna.tsinghua.edu.cn/simple



```


requirements.txt 的依赖内容参考如下:

```
numpy
scipy
pandas
scikit-learn
matplotlib
tqdm
rich
pytorch-lightning==1.9.4
hydra-core
omegaconf
wandb
einops
opt_einsum
cmake # For pykeops support
# pykeops # Only for S4D. If there are installation problems with pykeops==2.x, try pykeops==1.5
transformers==4.26.1 # For some schedulers and tokenizers
#torchvision
timm==0.9.16
prettytable
numerize
git-lfs

# Dataset specific packages
torchtext==0.16.0 # this needs to align with the pytorch version
#torchtext # this needs to align with the pytorch version
datasets # LRA

# genomic specific
pyfaidx
polars
genomic-benchmarks
loguru
liftover
```


# 训练


数据集

```
data
|-- hg38/
    |-- hg38.ml.fa
    |-- human-sequences.bed

mkdir -p data/hg38/
curl https://storage.googleapis.com/basenji_barnyard2/hg38.ml.fa.gz > data/hg38/hg38.ml.fa.gz
curl https://storage.googleapis.com/basenji_barnyard2/sequences_human.bed > data/hg38/human-sequences.bed
cd data/hg38/ && gzip -d hg38.ml.fa.gz


```
预训练

```
cd ../../
python -m train wandb=null experiment=hg38/hg38_hyena model.d_model=128 model.n_layer=2 dataset.batch_size=256 train.global_batch_size=256 dataset.max_length=1024 optimizer.lr=6e-4 trainer.devices=1
```





# 推理




# 参考链接

https://github.com/HazyResearch/hyena-dna.git