Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
Retrieval-based-Voice-Conversion-WebUI_pytorch
Commits
9867304a
Commit
9867304a
authored
Jul 23, 2024
by
chenzk
Browse files
v1.0
parents
Pipeline
#1408
canceled with stages
Changes
217
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3679 additions
and
0 deletions
+3679
-0
infer/lib/uvr5_pack/lib_v5/nets_123812KB.py
infer/lib/uvr5_pack/lib_v5/nets_123812KB.py
+122
-0
infer/lib/uvr5_pack/lib_v5/nets_123821KB.py
infer/lib/uvr5_pack/lib_v5/nets_123821KB.py
+122
-0
infer/lib/uvr5_pack/lib_v5/nets_33966KB.py
infer/lib/uvr5_pack/lib_v5/nets_33966KB.py
+122
-0
infer/lib/uvr5_pack/lib_v5/nets_537227KB.py
infer/lib/uvr5_pack/lib_v5/nets_537227KB.py
+123
-0
infer/lib/uvr5_pack/lib_v5/nets_537238KB.py
infer/lib/uvr5_pack/lib_v5/nets_537238KB.py
+123
-0
infer/lib/uvr5_pack/lib_v5/nets_61968KB.py
infer/lib/uvr5_pack/lib_v5/nets_61968KB.py
+122
-0
infer/lib/uvr5_pack/lib_v5/nets_new.py
infer/lib/uvr5_pack/lib_v5/nets_new.py
+133
-0
infer/lib/uvr5_pack/lib_v5/spec_utils.py
infer/lib/uvr5_pack/lib_v5/spec_utils.py
+676
-0
infer/lib/uvr5_pack/name_params.json
infer/lib/uvr5_pack/name_params.json
+264
-0
infer/lib/uvr5_pack/utils.py
infer/lib/uvr5_pack/utils.py
+121
-0
infer/modules/ipex/__init__.py
infer/modules/ipex/__init__.py
+190
-0
infer/modules/ipex/attention.py
infer/modules/ipex/attention.py
+218
-0
infer/modules/ipex/gradscaler.py
infer/modules/ipex/gradscaler.py
+187
-0
infer/modules/ipex/hijacks.py
infer/modules/ipex/hijacks.py
+365
-0
infer/modules/onnx/export.py
infer/modules/onnx/export.py
+52
-0
infer/modules/train/extract/extract_f0_print.py
infer/modules/train/extract/extract_f0_print.py
+175
-0
infer/modules/train/extract/extract_f0_rmvpe.py
infer/modules/train/extract/extract_f0_rmvpe.py
+141
-0
infer/modules/train/extract/extract_f0_rmvpe_dml.py
infer/modules/train/extract/extract_f0_rmvpe_dml.py
+139
-0
infer/modules/train/extract_feature_print.py
infer/modules/train/extract_feature_print.py
+142
-0
infer/modules/train/preprocess.py
infer/modules/train/preprocess.py
+142
-0
No files found.
infer/lib/uvr5_pack/lib_v5/nets_123812KB.py
0 → 100644
View file @
9867304a
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
.
import
layers_123821KB
as
layers
class BaseASPPNet(nn.Module):
    """U-Net-style encoder/decoder with an ASPP bottleneck.

    Four strided encoders widen channels from ``ch`` to ``ch * 8``; an ASPP
    module expands to ``ch * 16`` at the bottleneck; four decoders upsample
    back, each consuming the matching encoder skip connection.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        # nin: input channels; ch: base channel width; dilations: ASPP rates.
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Each decoder's input width is upsampled features + skip, e.g.
        # ch * (8 + 16) for dec4.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    # NOTE(review): defining __call__ directly bypasses nn.Module.__call__
    # (and therefore forward/backward hooks); the PyTorch convention is to
    # name this `forward` — confirm hook usage before renaming.
    def __call__(self, x):
        # Each encoder returns (downsampled features, skip tensor).
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of BaseASPPNet models producing a spectrogram mask.

    Stage 1 handles the low and high frequency halves separately; stages 2
    and 3 refine the full band, each fed the input concatenated with earlier
    outputs through a 1x1 bridge conv. The output is a sigmoid mask applied
    to the (detached) input magnitude.
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        self.out = nn.Conv2d(64, 2, 1, bias=False)
        # Auxiliary heads are only used for training-time supervision.
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

        self.max_bin = n_fft // 2  # frequency bins fed to the nets
        self.output_bin = n_fft // 2 + 1  # bins restored by padding

        self.offset = 128  # time frames trimmed from each side in predict()

    def forward(self, x, aggressiveness=None):
        # x: magnitude spectrogram; the slicing below implies layout
        # (batch, channel, freq, time) — TODO confirm against callers.
        mix = x.detach()
        x = x.clone()

        x = x[:, :, : self.max_bin]

        # Split the spectrum into low/high halves for stage 1.
        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        mask = torch.sigmoid(self.out(h))
        # Pad the frequency axis back up to output_bin by repeating the
        # highest bin.
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                # Sharpen the mask: gentler exponent below split_bin,
                # stronger above it.
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward and trim self.offset frames from both time edges."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
infer/lib/uvr5_pack/lib_v5/nets_123821KB.py
0 → 100644
View file @
9867304a
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
.
import
layers_123821KB
as
layers
class BaseASPPNet(nn.Module):
    """U-Net-style encoder/decoder with an ASPP bottleneck.

    Four strided encoders widen channels from ``ch`` to ``ch * 8``; an ASPP
    module expands to ``ch * 16`` at the bottleneck; four decoders upsample
    back, each consuming the matching encoder skip connection.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        # nin: input channels; ch: base channel width; dilations: ASPP rates.
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Each decoder's input width is upsampled features + skip, e.g.
        # ch * (8 + 16) for dec4.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    # NOTE(review): defining __call__ directly bypasses nn.Module.__call__
    # (and therefore forward/backward hooks); the PyTorch convention is to
    # name this `forward` — confirm hook usage before renaming.
    def __call__(self, x):
        # Each encoder returns (downsampled features, skip tensor).
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of BaseASPPNet models producing a spectrogram mask.

    Stage 1 handles the low and high frequency halves separately; stages 2
    and 3 refine the full band, each fed the input concatenated with earlier
    outputs through a 1x1 bridge conv. The output is a sigmoid mask applied
    to the (detached) input magnitude.
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        self.out = nn.Conv2d(64, 2, 1, bias=False)
        # Auxiliary heads are only used for training-time supervision.
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

        self.max_bin = n_fft // 2  # frequency bins fed to the nets
        self.output_bin = n_fft // 2 + 1  # bins restored by padding

        self.offset = 128  # time frames trimmed from each side in predict()

    def forward(self, x, aggressiveness=None):
        # x: magnitude spectrogram; the slicing below implies layout
        # (batch, channel, freq, time) — TODO confirm against callers.
        mix = x.detach()
        x = x.clone()

        x = x[:, :, : self.max_bin]

        # Split the spectrum into low/high halves for stage 1.
        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        mask = torch.sigmoid(self.out(h))
        # Pad the frequency axis back up to output_bin by repeating the
        # highest bin.
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                # Sharpen the mask: gentler exponent below split_bin,
                # stronger above it.
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward and trim self.offset frames from both time edges."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
infer/lib/uvr5_pack/lib_v5/nets_33966KB.py
0 → 100644
View file @
9867304a
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
.
import
layers_33966KB
as
layers
class BaseASPPNet(nn.Module):
    """U-Net-style encoder/decoder with an ASPP bottleneck.

    Four strided encoders widen channels from ``ch`` to ``ch * 8``; an ASPP
    module expands to ``ch * 16`` at the bottleneck; four decoders upsample
    back, each consuming the matching encoder skip connection. This variant
    uses four dilation rates in the ASPP module.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
        # nin: input channels; ch: base channel width; dilations: ASPP rates.
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Each decoder's input width is upsampled features + skip, e.g.
        # ch * (8 + 16) for dec4.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    # NOTE(review): defining __call__ directly bypasses nn.Module.__call__
    # (and therefore forward/backward hooks); the PyTorch convention is to
    # name this `forward` — confirm hook usage before renaming.
    def __call__(self, x):
        # Each encoder returns (downsampled features, skip tensor).
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of BaseASPPNet models producing a spectrogram mask.

    Smaller-capacity variant (base width 16). Stage 1 handles the low and
    high frequency halves separately; stages 2 and 3 refine the full band,
    each fed the input concatenated with earlier outputs through a 1x1
    bridge conv. The output is a sigmoid mask applied to the (detached)
    input magnitude.
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 16)
        self.stg1_high_band_net = BaseASPPNet(2, 16)

        self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(8, 16)

        self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(16, 32)

        self.out = nn.Conv2d(32, 2, 1, bias=False)
        # Auxiliary heads are only used for training-time supervision.
        self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)

        self.max_bin = n_fft // 2  # frequency bins fed to the nets
        self.output_bin = n_fft // 2 + 1  # bins restored by padding

        self.offset = 128  # time frames trimmed from each side in predict()

    def forward(self, x, aggressiveness=None):
        # x: magnitude spectrogram; the slicing below implies layout
        # (batch, channel, freq, time) — TODO confirm against callers.
        mix = x.detach()
        x = x.clone()

        x = x[:, :, : self.max_bin]

        # Split the spectrum into low/high halves for stage 1.
        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        mask = torch.sigmoid(self.out(h))
        # Pad the frequency axis back up to output_bin by repeating the
        # highest bin.
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                # Sharpen the mask: gentler exponent below split_bin,
                # stronger above it.
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward and trim self.offset frames from both time edges."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
infer/lib/uvr5_pack/lib_v5/nets_537227KB.py
0 → 100644
View file @
9867304a
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
.
import
layers_537238KB
as
layers
class BaseASPPNet(nn.Module):
    """U-Net-style encoder/decoder with an ASPP bottleneck.

    Four strided encoders widen channels from ``ch`` to ``ch * 8``; an ASPP
    module expands to ``ch * 16`` at the bottleneck; four decoders upsample
    back, each consuming the matching encoder skip connection.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        # nin: input channels; ch: base channel width; dilations: ASPP rates.
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Each decoder's input width is upsampled features + skip, e.g.
        # ch * (8 + 16) for dec4.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    # NOTE(review): defining __call__ directly bypasses nn.Module.__call__
    # (and therefore forward/backward hooks); the PyTorch convention is to
    # name this `forward` — confirm hook usage before renaming.
    def __call__(self, x):
        # Each encoder returns (downsampled features, skip tensor).
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of BaseASPPNet models producing a spectrogram mask.

    Large-capacity variant (base width 64). Stage 1 handles the low and high
    frequency halves separately; stages 2 and 3 refine the full band, each
    fed the input concatenated with earlier outputs through a 1x1 bridge
    conv. The output is a sigmoid mask applied to the (detached) input
    magnitude.
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 64)
        self.stg1_high_band_net = BaseASPPNet(2, 64)

        self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(32, 64)

        self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(64, 128)

        self.out = nn.Conv2d(128, 2, 1, bias=False)
        # Auxiliary heads are only used for training-time supervision.
        self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)

        self.max_bin = n_fft // 2  # frequency bins fed to the nets
        self.output_bin = n_fft // 2 + 1  # bins restored by padding

        self.offset = 128  # time frames trimmed from each side in predict()

    def forward(self, x, aggressiveness=None):
        # x: magnitude spectrogram; the slicing below implies layout
        # (batch, channel, freq, time) — TODO confirm against callers.
        mix = x.detach()
        x = x.clone()

        x = x[:, :, : self.max_bin]

        # Split the spectrum into low/high halves for stage 1.
        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        mask = torch.sigmoid(self.out(h))
        # Pad the frequency axis back up to output_bin by repeating the
        # highest bin.
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                # Sharpen the mask: gentler exponent below split_bin,
                # stronger above it.
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward and trim self.offset frames from both time edges."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
infer/lib/uvr5_pack/lib_v5/nets_537238KB.py
0 → 100644
View file @
9867304a
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
.
import
layers_537238KB
as
layers
class BaseASPPNet(nn.Module):
    """U-Net-style encoder/decoder with an ASPP bottleneck.

    Four strided encoders widen channels from ``ch`` to ``ch * 8``; an ASPP
    module expands to ``ch * 16`` at the bottleneck; four decoders upsample
    back, each consuming the matching encoder skip connection.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        # nin: input channels; ch: base channel width; dilations: ASPP rates.
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Each decoder's input width is upsampled features + skip, e.g.
        # ch * (8 + 16) for dec4.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    # NOTE(review): defining __call__ directly bypasses nn.Module.__call__
    # (and therefore forward/backward hooks); the PyTorch convention is to
    # name this `forward` — confirm hook usage before renaming.
    def __call__(self, x):
        # Each encoder returns (downsampled features, skip tensor).
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of BaseASPPNet models producing a spectrogram mask.

    Large-capacity variant (base width 64). Stage 1 handles the low and high
    frequency halves separately; stages 2 and 3 refine the full band, each
    fed the input concatenated with earlier outputs through a 1x1 bridge
    conv. The output is a sigmoid mask applied to the (detached) input
    magnitude.
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 64)
        self.stg1_high_band_net = BaseASPPNet(2, 64)

        self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(32, 64)

        self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(64, 128)

        self.out = nn.Conv2d(128, 2, 1, bias=False)
        # Auxiliary heads are only used for training-time supervision.
        self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)

        self.max_bin = n_fft // 2  # frequency bins fed to the nets
        self.output_bin = n_fft // 2 + 1  # bins restored by padding

        self.offset = 128  # time frames trimmed from each side in predict()

    def forward(self, x, aggressiveness=None):
        # x: magnitude spectrogram; the slicing below implies layout
        # (batch, channel, freq, time) — TODO confirm against callers.
        mix = x.detach()
        x = x.clone()

        x = x[:, :, : self.max_bin]

        # Split the spectrum into low/high halves for stage 1.
        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        mask = torch.sigmoid(self.out(h))
        # Pad the frequency axis back up to output_bin by repeating the
        # highest bin.
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                # Sharpen the mask: gentler exponent below split_bin,
                # stronger above it.
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward and trim self.offset frames from both time edges."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
infer/lib/uvr5_pack/lib_v5/nets_61968KB.py
0 → 100644
View file @
9867304a
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
.
import
layers_123821KB
as
layers
class BaseASPPNet(nn.Module):
    """U-Net-style encoder/decoder with an ASPP bottleneck.

    Four strided encoders widen channels from ``ch`` to ``ch * 8``; an ASPP
    module expands to ``ch * 16`` at the bottleneck; four decoders upsample
    back, each consuming the matching encoder skip connection.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        # nin: input channels; ch: base channel width; dilations: ASPP rates.
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Each decoder's input width is upsampled features + skip, e.g.
        # ch * (8 + 16) for dec4.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    # NOTE(review): defining __call__ directly bypasses nn.Module.__call__
    # (and therefore forward/backward hooks); the PyTorch convention is to
    # name this `forward` — confirm hook usage before renaming.
    def __call__(self, x):
        # Each encoder returns (downsampled features, skip tensor).
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of BaseASPPNet models producing a spectrogram mask.

    Stage 1 handles the low and high frequency halves separately; stages 2
    and 3 refine the full band, each fed the input concatenated with earlier
    outputs through a 1x1 bridge conv. The output is a sigmoid mask applied
    to the (detached) input magnitude.
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        self.out = nn.Conv2d(64, 2, 1, bias=False)
        # Auxiliary heads are only used for training-time supervision.
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

        self.max_bin = n_fft // 2  # frequency bins fed to the nets
        self.output_bin = n_fft // 2 + 1  # bins restored by padding

        self.offset = 128  # time frames trimmed from each side in predict()

    def forward(self, x, aggressiveness=None):
        # x: magnitude spectrogram; the slicing below implies layout
        # (batch, channel, freq, time) — TODO confirm against callers.
        mix = x.detach()
        x = x.clone()

        x = x[:, :, : self.max_bin]

        # Split the spectrum into low/high halves for stage 1.
        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        mask = torch.sigmoid(self.out(h))
        # Pad the frequency axis back up to output_bin by repeating the
        # highest bin.
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                # Sharpen the mask: gentler exponent below split_bin,
                # stronger above it.
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward and trim self.offset frames from both time edges."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
infer/lib/uvr5_pack/lib_v5/nets_new.py
0 → 100644
View file @
9867304a
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
.
import
layers_new
class BaseNet(nn.Module):
    """Encoder/decoder net with an ASPP bottleneck and an LSTM side branch.

    Five encoder stages (the first is stride 1), a dropout-enabled ASPP
    bottleneck, and decoders with skip connections. Before the final
    decoder, the output of an LSTM module over the dec2 features is
    concatenated as one extra channel (hence the ``+ 1`` on dec1's input).
    """

    def __init__(
        self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))
    ):
        super(BaseNet, self).__init__()
        # enc1 keeps spatial size (stride 1); enc2..enc5 downsample (stride 2).
        self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1)
        self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1)
        self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1)
        self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1)
        self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1)

        self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True)

        self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1)
        self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1)
        self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1)
        self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm)
        # +1 input channel for the concatenated LSTM branch output.
        self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)

    # NOTE(review): defining __call__ directly bypasses nn.Module.__call__
    # (and therefore forward/backward hooks); the PyTorch convention is to
    # name this `forward` — confirm hook usage before renaming.
    def __call__(self, x):
        e1 = self.enc1(x)
        e2 = self.enc2(e1)
        e3 = self.enc3(e2)
        e4 = self.enc4(e3)
        e5 = self.enc5(e4)

        h = self.aspp(e5)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        # Append the LSTM branch output as an extra channel before dec1.
        h = torch.cat([h, self.lstm_dec2(h)], dim=1)
        h = self.dec1(h, e1)

        return h
class CascadedNet(nn.Module):
    """Three-stage cascade of BaseNet models producing a spectrogram mask.

    Unlike the older CascadedASPPNet variants, stage 2 also processes low
    and high bands separately, and each band net at stage 2 receives its
    band's raw input concatenated with the stage-1 output. forward() returns
    the mask (plus an auxiliary mask during training) rather than the
    masked magnitude.
    """

    def __init__(self, n_fft, nout=32, nout_lstm=128):
        super(CascadedNet, self).__init__()

        self.max_bin = n_fft // 2  # frequency bins fed to the nets
        self.output_bin = n_fft // 2 + 1  # bins restored by padding
        self.nin_lstm = self.max_bin // 2
        self.offset = 64  # time frames trimmed per side in predict()/predict_mask()

        # Stage 1: per-band nets; the low-band net is followed by a 1x1
        # conv shrinking its channels to nout // 4 to match the high band.
        self.stg1_low_band_net = nn.Sequential(
            BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm),
            layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0),
        )
        self.stg1_high_band_net = BaseNet(
            2, nout // 4, self.nin_lstm // 2, nout_lstm // 2
        )

        # Stage 2: per-band nets fed band input + stage-1 output (hence +2).
        self.stg2_low_band_net = nn.Sequential(
            BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm),
            layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
        )
        self.stg2_high_band_net = BaseNet(
            nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2
        )

        # Stage 3: full-band net over input + both aux feature maps.
        self.stg3_full_band_net = BaseNet(
            3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm
        )

        self.out = nn.Conv2d(nout, 2, 1, bias=False)
        # Auxiliary head used only for training-time supervision.
        self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)

    def forward(self, x):
        # x: magnitude spectrogram; the slicing below implies layout
        # (batch, channel, freq, time) — TODO confirm against callers.
        x = x[:, :, : self.max_bin]

        # Split the spectrum into low/high halves.
        bandw = x.size()[2] // 2
        l1_in = x[:, :, :bandw]
        h1_in = x[:, :, bandw:]
        l1 = self.stg1_low_band_net(l1_in)
        h1 = self.stg1_high_band_net(h1_in)
        aux1 = torch.cat([l1, h1], dim=2)

        # Stage 2 inputs: raw band concatenated with stage-1 features.
        l2_in = torch.cat([l1_in, l1], dim=1)
        h2_in = torch.cat([h1_in, h1], dim=1)
        l2 = self.stg2_low_band_net(l2_in)
        h2 = self.stg2_high_band_net(h2_in)
        aux2 = torch.cat([l2, h2], dim=2)

        f3_in = torch.cat([x, aux1, aux2], dim=1)
        f3 = self.stg3_full_band_net(f3_in)

        mask = torch.sigmoid(self.out(f3))
        # Pad the frequency axis back up to output_bin by repeating the
        # highest bin.
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            aux = torch.cat([aux1, aux2], dim=1)
            aux = torch.sigmoid(self.aux_out(aux))
            aux = F.pad(
                input=aux,
                pad=(0, 0, 0, self.output_bin - aux.size()[2]),
                mode="replicate",
            )
            return mask, aux
        else:
            return mask

    def predict_mask(self, x):
        """Return the mask with self.offset frames trimmed from both time edges."""
        mask = self.forward(x)

        if self.offset > 0:
            mask = mask[:, :, :, self.offset : -self.offset]
            assert mask.size()[3] > 0

        return mask

    def predict(self, x, aggressiveness=None):
        """Return the masked magnitude, trimmed by self.offset on the time axis.

        NOTE(review): `aggressiveness` is accepted but unused here, unlike
        the older CascadedASPPNet.predict — presumably kept for interface
        compatibility; confirm with callers.
        """
        mask = self.forward(x)
        pred_mag = x * mask

        if self.offset > 0:
            pred_mag = pred_mag[:, :, :, self.offset : -self.offset]
            assert pred_mag.size()[3] > 0

        return pred_mag
infer/lib/uvr5_pack/lib_v5/spec_utils.py
0 → 100644
View file @
9867304a
import
hashlib
import
json
import
math
import
os
import
librosa
import
numpy
as
np
import
soundfile
as
sf
from
tqdm
import
tqdm
def crop_center(h1, h2):
    """Center-crop ``h1`` along its last (time) axis to match ``h2``'s width.

    Both arguments are 4-D tensors indexed as (batch, channel, freq, time).
    Returns ``h1`` unchanged when the widths already match; raises
    ``ValueError`` when ``h1`` is narrower than ``h2``.
    """
    width = h1.size()[3]
    target = h2.size()[3]

    if width == target:
        return h1
    if width < target:
        raise ValueError("h1_shape[3] must be greater than h2_shape[3]")

    # Frequency axes are assumed to already agree; only time is cropped.
    start = (width - target) // 2
    return h1[:, :, :, start : start + target]
def wave_to_spectrogram(
    wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
    """STFT a stereo wave, optionally re-encoding the channels first.

    ``reverse`` time-flips both channels; ``mid_side`` stores (L+R)/2 and
    L-R; ``mid_side_b2`` stores R + 0.5*L and L - 0.5*R.  Exactly one flag
    is honored, in that order of precedence.
    """
    left, right = wave[0], wave[1]

    if reverse:
        ch_a = np.flip(np.asfortranarray(left))
        ch_b = np.flip(np.asfortranarray(right))
    elif mid_side:
        ch_a = np.asfortranarray(np.add(left, right) / 2)
        ch_b = np.asfortranarray(np.subtract(left, right))
    elif mid_side_b2:
        ch_a = np.asfortranarray(np.add(right, left * 0.5))
        ch_b = np.asfortranarray(np.subtract(left, right * 0.5))
    else:
        ch_a = np.asfortranarray(left)
        ch_b = np.asfortranarray(right)

    spec_a = librosa.stft(ch_a, n_fft=n_fft, hop_length=hop_length)
    spec_b = librosa.stft(ch_b, n_fft=n_fft, hop_length=hop_length)

    return np.asfortranarray([spec_a, spec_b])
def wave_to_spectrogram_mt(
    wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
    """Multithreaded variant of ``wave_to_spectrogram``.

    The left-channel STFT runs on a worker thread while the right channel
    is transformed on the calling thread.  Channel re-encoding flags behave
    exactly as in ``wave_to_spectrogram``.
    """
    import threading

    if reverse:
        wave_left = np.flip(np.asfortranarray(wave[0]))
        wave_right = np.flip(np.asfortranarray(wave[1]))
    elif mid_side:
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])

    # Bug fix: the original stored the worker's result via ``global
    # spec_left``, which pollutes the module namespace and races if two
    # calls overlap.  A local container keeps the result per-invocation.
    result = {}

    def run_thread(**kwargs):
        result["spec_left"] = librosa.stft(**kwargs)

    thread = threading.Thread(
        target=run_thread,
        kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length},
    )
    thread.start()
    spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
    thread.join()

    return np.asfortranarray([result["spec_left"], spec_right])
def combine_spectrograms(specs, mp):
    """Stack per-band spectrograms (dict keyed 1..n_bands) into one array.

    Each band contributes its crop_start:crop_stop bin range, laid out
    contiguously along the frequency axis; all bands are truncated to the
    shortest common frame count.  ``mp`` supplies the band layout and the
    optional pre-filter settings via ``mp.param``.
    """
    # Shortest frame count over all bands — everything is trimmed to it.
    l = min([specs[i].shape[2] for i in specs])
    spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64)
    offset = 0
    bands_n = len(mp.param["band"])

    # Bands are 1-indexed in the parameter files.
    for d in range(1, bands_n + 1):
        h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"]
        spec_c[:, offset : offset + h, :l] = specs[d][
            :, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l
        ]
        offset += h

    if offset > mp.param["bins"]:
        raise ValueError("Too much bins")

    # lowpass filter
    if (
        mp.param["pre_filter_start"] > 0
    ):  # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
        if bands_n == 1:
            spec_c = fft_lp_filter(
                spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]
            )
        else:
            # Multi-band: exponential roll-off whose slope eases as the
            # gain decays (the previous gain ``gp`` feeds the exponent).
            gp = 1
            for b in range(
                mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]
            ):
                g = math.pow(
                    10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0
                )
                gp = g
                spec_c[:, b, :] *= g

    return np.asfortranarray(spec_c)
def spectrogram_to_image(spec, mode="magnitude"):
    """Render a spectrogram as an 8-bit image array.

    "magnitude" mode maps log-power to intensity; "phase" mode maps the
    angle.  A 3-D result is transposed to HWC with a max-over-channels
    plane prepended.  NOTE: for a real-valued input in "phase" mode the
    input array itself is normalized in place (original behavior, kept).
    """
    if mode == "magnitude":
        y = np.abs(spec) if np.iscomplexobj(spec) else spec
        y = np.log10(y**2 + 1e-8)
    elif mode == "phase":
        y = np.angle(spec) if np.iscomplexobj(spec) else spec

    # Normalize to the 0..255 range before the uint8 cast.
    y -= y.min()
    y *= 255 / y.max()
    img = np.uint8(y)

    if y.ndim == 3:
        img = img.transpose(1, 2, 0)
        img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)

    return img
def reduce_vocal_aggressively(X, y, softmask):
    """Shrink ``y``'s magnitude wherever the residual X - y dominates it.

    Bins where |X - y| exceeds |y| have |y| reduced by softmask * |X - y|
    (clipped at zero); ``y``'s phase is kept unchanged.
    """
    residual = X - y
    y_mag = np.abs(y)
    residual_mag = np.abs(residual)

    dominated = residual_mag > y_mag
    new_mag = np.clip(y_mag - residual_mag * dominated * softmask, 0, np.inf)

    return new_mag * np.exp(1.0j * np.angle(y))
def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
    """Blend ``ref`` into ``mag`` over frames where ``ref`` is near-silent.

    Frames whose mean magnitude (over channels and bins) falls below
    ``thres`` are grouped into contiguous runs; runs longer than
    ``min_range`` frames have ``ref`` added into ``mag`` with linear
    fade-in/fade-out ramps of ``fade_size`` frames at each edge.
    Returns a modified copy of ``mag``.
    """
    if min_range < fade_size * 2:
        # NOTE(review): the message says "fade_area" but the parameter is
        # named fade_size — presumably a leftover from an older name.
        raise ValueError("min_range must be >= fade_area * 2")

    mag = mag.copy()

    # Frame indices where the reference is quieter than the threshold.
    idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
    # Split into contiguous runs: each gap (diff != 1) starts a new run.
    starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
    ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
    # Keep only runs long enough to be treated as silence.
    uninformative = np.where(ends - starts > min_range)[0]
    if len(uninformative) > 0:
        starts = starts[uninformative]
        ends = ends[uninformative]
        old_e = None
        for s, e in zip(starts, ends):
            # If this run starts too close to the previous one, extend it
            # backwards so the two fades do not overlap.
            if old_e is not None and s - old_e < fade_size:
                s = old_e - fade_size * 2

            if s != 0:
                # Fade in over the first fade_size frames of the run.
                weight = np.linspace(0, 1, fade_size)
                mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size]
            else:
                # Run starts at frame 0: skip the fade-in and widen the
                # full-strength region to the left instead.
                s -= fade_size

            if e != mag.shape[2]:
                # Fade out over the last fade_size frames of the run.
                weight = np.linspace(1, 0, fade_size)
                mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e]
            else:
                # Run ends at the final frame: skip the fade-out.
                e += fade_size

            # Full-strength copy between the two ramps.
            mag[:, :, s + fade_size : e - fade_size] += ref[
                :, :, s + fade_size : e - fade_size
            ]
            old_e = e

    return mag
def align_wave_head_and_tail(a, b):
    """Trim both stereo waves to the shorter per-channel sample count."""
    shortest = min(a[0].size, b[0].size)
    return a[:shortest, :shortest], b[:shortest, :shortest]
def cache_or_load(mix_path, inst_path, mp):
    """Return (mix, instrumental) combined spectrograms, caching as .npy.

    The cache directory name embeds a SHA-1 of the model parameters, so
    different band layouts never share cache entries.  On a cache miss the
    audio is loaded at the highest band's sample rate, progressively
    resampled down for the lower bands, converted to per-band spectrograms
    and combined with ``combine_spectrograms``.
    """
    mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
    inst_basename = os.path.splitext(os.path.basename(inst_path))[0]

    # Parameter-dependent cache key: identical params -> identical dir.
    cache_dir = "mph{}".format(
        hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()
    )
    mix_cache_dir = os.path.join("cache", cache_dir)
    inst_cache_dir = os.path.join("cache", cache_dir)

    os.makedirs(mix_cache_dir, exist_ok=True)
    os.makedirs(inst_cache_dir, exist_ok=True)

    mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy")
    inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy")

    if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
        # Cache hit: reuse the previously combined spectrograms.
        X_spec_m = np.load(mix_cache_path)
        y_spec_m = np.load(inst_cache_path)
    else:
        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}

        # Walk the bands from highest (native sample rate) to lowest.
        for d in range(len(mp.param["band"]), 0, -1):
            bp = mp.param["band"][d]

            if d == len(mp.param["band"]):
                # high-end band: load from disk at the band's sample rate.
                X_wave[d], _ = librosa.load(
                    mix_path,
                    sr=bp["sr"],
                    mono=False,
                    dtype=np.float32,
                    res_type=bp["res_type"],
                )
                y_wave[d], _ = librosa.load(
                    inst_path,
                    sr=bp["sr"],
                    mono=False,
                    dtype=np.float32,
                    res_type=bp["res_type"],
                )
            else:
                # lower bands: resample down from the band above.
                X_wave[d] = librosa.resample(
                    X_wave[d + 1],
                    orig_sr=mp.param["band"][d + 1]["sr"],
                    target_sr=bp["sr"],
                    res_type=bp["res_type"],
                )
                y_wave[d] = librosa.resample(
                    y_wave[d + 1],
                    orig_sr=mp.param["band"][d + 1]["sr"],
                    target_sr=bp["sr"],
                    res_type=bp["res_type"],
                )

            # Mix and instrumental must cover the same sample range.
            X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])

            X_spec_s[d] = wave_to_spectrogram(
                X_wave[d],
                bp["hl"],
                bp["n_fft"],
                mp.param["mid_side"],
                mp.param["mid_side_b2"],
                mp.param["reverse"],
            )
            y_spec_s[d] = wave_to_spectrogram(
                y_wave[d],
                bp["hl"],
                bp["n_fft"],
                mp.param["mid_side"],
                mp.param["mid_side_b2"],
                mp.param["reverse"],
            )

        del X_wave, y_wave

        X_spec_m = combine_spectrograms(X_spec_s, mp)
        y_spec_m = combine_spectrograms(y_spec_s, mp)

        if X_spec_m.shape != y_spec_m.shape:
            raise ValueError("The combined spectrograms are different: " + mix_path)

        # NOTE: ``ext`` is computed but never used — kept as-is.
        _, ext = os.path.splitext(mix_path)

        np.save(mix_cache_path, X_spec_m)
        np.save(inst_cache_path, y_spec_m)

    return X_spec_m, y_spec_m
def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
    """Invert a 2-channel spectrogram and undo the channel encoding.

    Inverse of ``wave_to_spectrogram``: after the per-channel ISTFT the
    mid/side (or reversed) representation is decoded back to left/right.
    """
    wave_left = librosa.istft(np.asfortranarray(spec[0]), hop_length=hop_length)
    wave_right = librosa.istft(np.asfortranarray(spec[1]), hop_length=hop_length)

    if reverse:
        channels = [np.flip(wave_left), np.flip(wave_right)]
    elif mid_side:
        channels = [
            np.add(wave_left, wave_right / 2),
            np.subtract(wave_left, wave_right / 2),
        ]
    elif mid_side_b2:
        channels = [
            np.add(wave_right / 1.25, 0.4 * wave_left),
            np.subtract(wave_left / 1.25, 0.4 * wave_right),
        ]
    else:
        channels = [wave_left, wave_right]

    return np.asfortranarray(channels)
def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
    """Multithreaded variant of ``spectrogram_to_wave``.

    The left-channel ISTFT runs on a worker thread while the right channel
    is inverted on the calling thread.  NOTE: the parameter order differs
    from ``spectrogram_to_wave`` (``reverse`` precedes ``mid_side_b2``
    here); kept unchanged for caller compatibility.
    """
    import threading

    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    # Bug fix: the original handed the worker's result back via ``global
    # wave_left``, which pollutes the module namespace and races if two
    # calls overlap.  A local container keeps the result per-invocation.
    result = {}

    def run_thread(**kwargs):
        result["wave_left"] = librosa.istft(**kwargs)

    thread = threading.Thread(
        target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}
    )
    thread.start()
    wave_right = librosa.istft(spec_right, hop_length=hop_length)
    thread.join()
    wave_left = result["wave_left"]

    if reverse:
        return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
    elif mid_side:
        return np.asfortranarray(
            [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
        )
    elif mid_side_b2:
        return np.asfortranarray(
            [
                np.add(wave_right / 1.25, 0.4 * wave_left),
                np.subtract(wave_left / 1.25, 0.4 * wave_right),
            ]
        )
    else:
        return np.asfortranarray([wave_left, wave_right])
def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
    """Invert a combined multi-band spectrogram back into a stereo wave.

    Walks bands from lowest to highest: each band's bin range is carved
    out of ``spec_m``, band-pass filtered, inverted to audio, and
    accumulated while resampling up to the next band's rate.
    ``extra_bins`` / ``extra_bins_h`` optionally re-inject preserved
    high-end bins into the top band.  Returns the transposed wave
    (samples-first layout).
    """
    wave_band = {}  # NOTE: never used — kept as-is
    bands_n = len(mp.param["band"])
    offset = 0

    for d in range(1, bands_n + 1):
        bp = mp.param["band"][d]
        # Re-expand this band's crop back into a full n_fft//2+1 bin frame.
        spec_s = np.ndarray(
            shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
        )
        h = bp["crop_stop"] - bp["crop_start"]
        spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
            :, offset : offset + h, :
        ]

        offset += h
        if d == bands_n:  # higher
            if extra_bins_h:  # if --high_end_process bypass
                max_bin = bp["n_fft"] // 2
                spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
                    :, :extra_bins_h, :
                ]
            if bp["hpf_start"] > 0:
                spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
            if bands_n == 1:
                # Single-band model: the inverted band is the whole wave.
                wave = spectrogram_to_wave(
                    spec_s,
                    bp["hl"],
                    mp.param["mid_side"],
                    mp.param["mid_side_b2"],
                    mp.param["reverse"],
                )
            else:
                wave = np.add(
                    wave,
                    spectrogram_to_wave(
                        spec_s,
                        bp["hl"],
                        mp.param["mid_side"],
                        mp.param["mid_side_b2"],
                        mp.param["reverse"],
                    ),
                )
        else:
            sr = mp.param["band"][d + 1]["sr"]
            if d == 1:  # lower
                spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
                wave = librosa.resample(
                    spectrogram_to_wave(
                        spec_s,
                        bp["hl"],
                        mp.param["mid_side"],
                        mp.param["mid_side_b2"],
                        mp.param["reverse"],
                    ),
                    orig_sr=bp["sr"],
                    target_sr=sr,
                    res_type="sinc_fastest",
                )
            else:  # mid
                spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
                spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
                wave2 = np.add(
                    wave,
                    spectrogram_to_wave(
                        spec_s,
                        bp["hl"],
                        mp.param["mid_side"],
                        mp.param["mid_side_b2"],
                        mp.param["reverse"],
                    ),
                )
                # wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest")
                wave = librosa.resample(
                    wave2, orig_sr=bp["sr"], target_sr=sr, res_type="scipy"
                )

    return wave.T
def fft_lp_filter(spec, bin_start, bin_stop):
    """In-place low-pass: linearly fade bins from ``bin_start`` up to
    ``bin_stop`` and zero every bin from ``bin_stop`` upward.

    Returns the same (mutated) array.
    """
    fade = 1 / (bin_stop - bin_start)
    gain = 1.0
    for band in range(bin_start, bin_stop):
        gain -= fade
        spec[:, band, :] = gain * spec[:, band, :]

    spec[:, bin_stop:, :] *= 0

    return spec
def fft_hp_filter(spec, bin_start, bin_stop):
    """In-place high-pass: fade bins downward from ``bin_start`` to just
    above ``bin_stop`` and zero everything at or below ``bin_stop``.

    Returns the same (mutated) array.  ``bin_start`` must be above
    ``bin_stop`` (the loop steps by -1).
    """
    fade = 1 / (bin_start - bin_stop)
    gain = 1.0
    for band in range(bin_start, bin_stop, -1):
        gain -= fade
        spec[:, band, :] = gain * spec[:, band, :]

    spec[:, 0 : bin_stop + 1, :] *= 0

    return spec
def mirroring(a, spec_m, input_high_end, mp):
    """Reconstruct missing high-frequency bins by mirroring lower bins.

    Mode "mirroring" caps the supplied high end with a phase-adjusted
    mirror of the bins just below the pre-filter cutoff; "mirroring2"
    caps it with the mirror scaled by the high end times 1.7.  Any other
    mode falls through and returns None.
    """
    if a in ("mirroring", "mirroring2"):
        stop = mp.param["pre_filter_start"] - 10
        start = stop - input_high_end.shape[1]
        mirror = np.flip(np.abs(spec_m[:, start:stop, :]), 1)

        if a == "mirroring":
            candidate = mirror * np.exp(1.0j * np.angle(input_high_end))
        else:
            candidate = np.multiply(mirror, input_high_end * 1.7)

        return np.where(
            np.abs(input_high_end) <= np.abs(candidate), input_high_end, candidate
        )
def ensembling(a, specs):
    """Fold a list of spectrograms into one by per-bin magnitude choice.

    ``a`` selects the rule: "min_mag" keeps the smaller-magnitude bin,
    "max_mag" the larger.  Spectrograms are truncated to the shortest
    common frame count as they are merged; the entries of ``specs`` are
    trimmed in place.
    """
    for idx in range(1, len(specs)):
        if idx == 1:
            spec = specs[0]

        common = min(spec.shape[2], specs[idx].shape[2])
        spec = spec[:, :, :common]
        specs[idx] = specs[idx][:, :, :common]

        if "min_mag" == a:
            spec = np.where(np.abs(specs[idx]) <= np.abs(spec), specs[idx], spec)
        if "max_mag" == a:
            spec = np.where(np.abs(specs[idx]) >= np.abs(spec), specs[idx], spec)

    return spec
def stft(wave, nfft, hl):
    """Per-channel STFT of a stereo wave; stacks both channel spectra."""
    channels = [np.asfortranarray(wave[0]), np.asfortranarray(wave[1])]
    spectra = [librosa.stft(ch, n_fft=nfft, hop_length=hl) for ch in channels]
    return np.asfortranarray(spectra)
def istft(spec, hl):
    """Per-channel inverse STFT of a 2-channel spectrogram.

    Returns the stacked stereo wave.  Bug fix: the original built ``wave``
    but fell off the end of the function, implicitly returning None.
    """
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])
    wave_left = librosa.istft(spec_left, hop_length=hl)
    wave_right = librosa.istft(spec_right, hop_length=hl)
    wave = np.asfortranarray([wave_left, wave_right])
    return wave
if __name__ == "__main__":
    # Command-line tool: combine/invert/ensemble spectrograms of the input
    # audio files using the helpers defined above.
    import argparse
    import sys
    import time

    import cv2
    from model_param_init import ModelParameters

    p = argparse.ArgumentParser()
    p.add_argument(
        "--algorithm",
        "-a",
        type=str,
        choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"],
        default="min_mag",
    )
    p.add_argument(
        "--model_params",
        "-m",
        type=str,
        default=os.path.join("modelparams", "1band_sr44100_hl512.json"),
    )
    p.add_argument("--output_name", "-o", type=str, default="output")
    p.add_argument("--vocals_only", "-v", action="store_true")
    p.add_argument("input", nargs="+")
    args = p.parse_args()

    start_time = time.time()

    if args.algorithm.startswith("invert") and len(args.input) != 2:
        raise ValueError("There should be two input files.")

    if not args.algorithm.startswith("invert") and len(args.input) < 2:
        raise ValueError("There must be at least two input files.")

    wave, specs = {}, {}

    mp = ModelParameters(args.model_params)

    # Build a combined multi-band spectrogram for every input file.
    for i in range(len(args.input)):
        spec = {}

        for d in range(len(mp.param["band"]), 0, -1):
            bp = mp.param["band"][d]

            if d == len(mp.param["band"]):  # high-end band
                wave[d], _ = librosa.load(
                    args.input[i],
                    sr=bp["sr"],
                    mono=False,
                    dtype=np.float32,
                    res_type=bp["res_type"],
                )

                if len(wave[d].shape) == 1:  # mono to stereo
                    wave[d] = np.array([wave[d], wave[d]])
            else:  # lower bands
                wave[d] = librosa.resample(
                    wave[d + 1],
                    orig_sr=mp.param["band"][d + 1]["sr"],
                    target_sr=bp["sr"],
                    res_type=bp["res_type"],
                )

            spec[d] = wave_to_spectrogram(
                wave[d],
                bp["hl"],
                bp["n_fft"],
                mp.param["mid_side"],
                mp.param["mid_side_b2"],
                mp.param["reverse"],
            )

        specs[i] = combine_spectrograms(spec, mp)

    del wave

    if args.algorithm == "deep":
        # NOTE(review): ``spec[1]`` indexes band 1 of the LAST file's
        # per-band dict, while ``specs[1]`` would be the second file's
        # combined spectrogram — this looks like it should read
        # ``specs[1]``; confirm before relying on the "deep" mode.
        d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1])
        v_spec = d_spec - specs[1]
        sf.write(
            os.path.join("{}.wav".format(args.output_name)),
            cmb_spectrogram_to_wave(v_spec, mp),
            mp.param["sr"],
        )

    if args.algorithm.startswith("invert"):
        # Trim both spectrograms to a common frame count.
        ln = min([specs[0].shape[2], specs[1].shape[2]])
        specs[0] = specs[0][:, :, :ln]
        specs[1] = specs[1][:, :, :ln]

        if "invert_p" == args.algorithm:
            # Phase-aware inversion: subtract the larger magnitude carried
            # on the first input's phase.
            X_mag = np.abs(specs[0])
            y_mag = np.abs(specs[1])
            max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
            v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0]))
        else:
            specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
            v_spec = specs[0] - specs[1]

        if not args.vocals_only:
            # Also emit debug images and the separated stems.
            X_mag = np.abs(specs[0])
            y_mag = np.abs(specs[1])
            v_mag = np.abs(v_spec)

            X_image = spectrogram_to_image(X_mag)
            y_image = spectrogram_to_image(y_mag)
            v_image = spectrogram_to_image(v_mag)

            cv2.imwrite("{}_X.png".format(args.output_name), X_image)
            cv2.imwrite("{}_y.png".format(args.output_name), y_image)
            cv2.imwrite("{}_v.png".format(args.output_name), v_image)

            sf.write(
                "{}_X.wav".format(args.output_name),
                cmb_spectrogram_to_wave(specs[0], mp),
                mp.param["sr"],
            )
            sf.write(
                "{}_y.wav".format(args.output_name),
                cmb_spectrogram_to_wave(specs[1], mp),
                mp.param["sr"],
            )

        sf.write(
            "{}_v.wav".format(args.output_name),
            cmb_spectrogram_to_wave(v_spec, mp),
            mp.param["sr"],
        )
    else:
        if not args.algorithm == "deep":
            sf.write(
                os.path.join("ensembled", "{}.wav".format(args.output_name)),
                cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp),
                mp.param["sr"],
            )

    if args.algorithm == "align":
        trackalignment = [
            {
                "file1": '"{}"'.format(args.input[0]),
                "file2": '"{}"'.format(args.input[1]),
            }
        ]

        # Delegate the actual alignment to an external helper script.
        for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
            os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}")

    # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))
infer/lib/uvr5_pack/name_params.json
0 → 100644
View file @
9867304a
{
"equivalent"
:
[
{
"model_hash_name"
:
[
{
"hash_name"
:
"47939caf0cfe52a0e81442b85b971dfd"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json"
,
"param_name"
:
"4band_44100"
},
{
"hash_name"
:
"4e4ecb9764c50a8c414fee6e10395bbe"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json"
,
"param_name"
:
"4band_v2"
},
{
"hash_name"
:
"ca106edd563e034bde0bdec4bb7a4b36"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json"
,
"param_name"
:
"4band_v2"
},
{
"hash_name"
:
"e60a1e84803ce4efc0a6551206cc4b71"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json"
,
"param_name"
:
"4band_44100"
},
{
"hash_name"
:
"a82f14e75892e55e994376edbf0c8435"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json"
,
"param_name"
:
"4band_44100"
},
{
"hash_name"
:
"6dd9eaa6f0420af9f1d403aaafa4cc06"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json"
,
"param_name"
:
"4band_v2_sn"
},
{
"hash_name"
:
"08611fb99bd59eaa79ad27c58d137727"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json"
,
"param_name"
:
"4band_v2_sn"
},
{
"hash_name"
:
"5c7bbca45a187e81abbbd351606164e5"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json"
,
"param_name"
:
"3band_44100_msb2"
},
{
"hash_name"
:
"d6b2cb685a058a091e5e7098192d3233"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json"
,
"param_name"
:
"3band_44100_msb2"
},
{
"hash_name"
:
"c1b9f38170a7c90e96f027992eb7c62b"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json"
,
"param_name"
:
"4band_44100"
},
{
"hash_name"
:
"c3448ec923fa0edf3d03a19e633faa53"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json"
,
"param_name"
:
"4band_44100"
},
{
"hash_name"
:
"68aa2c8093d0080704b200d140f59e54"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json"
,
"param_name"
:
"3band_44100"
},
{
"hash_name"
:
"fdc83be5b798e4bd29fe00fe6600e147"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json"
,
"param_name"
:
"3band_44100_mid.json"
},
{
"hash_name"
:
"2ce34bc92fd57f55db16b7a4def3d745"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json"
,
"param_name"
:
"3band_44100_mid.json"
},
{
"hash_name"
:
"52fdca89576f06cf4340b74a4730ee5f"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json"
,
"param_name"
:
"4band_44100.json"
},
{
"hash_name"
:
"41191165b05d38fc77f072fa9e8e8a30"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json"
,
"param_name"
:
"4band_44100.json"
},
{
"hash_name"
:
"89e83b511ad474592689e562d5b1f80e"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json"
,
"param_name"
:
"2band_32000.json"
},
{
"hash_name"
:
"0b954da81d453b716b114d6d7c95177f"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json"
,
"param_name"
:
"2band_32000.json"
}
],
"v4 Models"
:
[
{
"hash_name"
:
"6a00461c51c2920fd68937d4609ed6c8"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json"
,
"param_name"
:
"1band_sr16000_hl512"
},
{
"hash_name"
:
"0ab504864d20f1bd378fe9c81ef37140"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json"
,
"param_name"
:
"1band_sr32000_hl512"
},
{
"hash_name"
:
"7dd21065bf91c10f7fccb57d7d83b07f"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json"
,
"param_name"
:
"1band_sr32000_hl512"
},
{
"hash_name"
:
"80ab74d65e515caa3622728d2de07d23"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json"
,
"param_name"
:
"1band_sr32000_hl512"
},
{
"hash_name"
:
"edc115e7fc523245062200c00caa847f"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json"
,
"param_name"
:
"1band_sr33075_hl384"
},
{
"hash_name"
:
"28063e9f6ab5b341c5f6d3c67f2045b7"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json"
,
"param_name"
:
"1band_sr33075_hl384"
},
{
"hash_name"
:
"b58090534c52cbc3e9b5104bad666ef2"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json"
,
"param_name"
:
"1band_sr44100_hl512"
},
{
"hash_name"
:
"0cdab9947f1b0928705f518f3c78ea8f"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json"
,
"param_name"
:
"1band_sr44100_hl512"
},
{
"hash_name"
:
"ae702fed0238afb5346db8356fe25f13"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json"
,
"param_name"
:
"1band_sr44100_hl1024"
}
]
}
],
"User Models"
:
[
{
"1 Band"
:
[
{
"hash_name"
:
"1band_sr16000_hl512"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json"
,
"param_name"
:
"1band_sr16000_hl512"
},
{
"hash_name"
:
"1band_sr32000_hl512"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json"
,
"param_name"
:
"1band_sr16000_hl512"
},
{
"hash_name"
:
"1band_sr33075_hl384"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json"
,
"param_name"
:
"1band_sr33075_hl384"
},
{
"hash_name"
:
"1band_sr44100_hl256"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json"
,
"param_name"
:
"1band_sr44100_hl256"
},
{
"hash_name"
:
"1band_sr44100_hl512"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json"
,
"param_name"
:
"1band_sr44100_hl512"
},
{
"hash_name"
:
"1band_sr44100_hl1024"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json"
,
"param_name"
:
"1band_sr44100_hl1024"
}
],
"2 Band"
:
[
{
"hash_name"
:
"2band_44100_lofi"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json"
,
"param_name"
:
"2band_44100_lofi"
},
{
"hash_name"
:
"2band_32000"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json"
,
"param_name"
:
"2band_32000"
},
{
"hash_name"
:
"2band_48000"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json"
,
"param_name"
:
"2band_48000"
}
],
"3 Band"
:
[
{
"hash_name"
:
"3band_44100"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json"
,
"param_name"
:
"3band_44100"
},
{
"hash_name"
:
"3band_44100_mid"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json"
,
"param_name"
:
"3band_44100_mid"
},
{
"hash_name"
:
"3band_44100_msb2"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json"
,
"param_name"
:
"3band_44100_msb2"
}
],
"4 Band"
:
[
{
"hash_name"
:
"4band_44100"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json"
,
"param_name"
:
"4band_44100"
},
{
"hash_name"
:
"4band_44100_mid"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json"
,
"param_name"
:
"4band_44100_mid"
},
{
"hash_name"
:
"4band_44100_msb"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json"
,
"param_name"
:
"4band_44100_msb"
},
{
"hash_name"
:
"4band_44100_msb2"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json"
,
"param_name"
:
"4band_44100_msb2"
},
{
"hash_name"
:
"4band_44100_reverse"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json"
,
"param_name"
:
"4band_44100_reverse"
},
{
"hash_name"
:
"4band_44100_sw"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json"
,
"param_name"
:
"4band_44100_sw"
},
{
"hash_name"
:
"4band_v2"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json"
,
"param_name"
:
"4band_v2"
},
{
"hash_name"
:
"4band_v2_sn"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json"
,
"param_name"
:
"4band_v2_sn"
},
{
"hash_name"
:
"tmodelparam"
,
"model_params"
:
"infer/lib/uvr5_pack/lib_v5/modelparams/tmodelparam.json"
,
"param_name"
:
"User Model Param Set"
}
]
}
]
}
\ No newline at end of file
infer/lib/uvr5_pack/utils.py
0 → 100644
View file @
9867304a
import
json
import
numpy
as
np
import
torch
from
tqdm
import
tqdm
def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict:
    """Read a JSON file and return its parsed contents.

    Defaults to the bundled UVR5 model-name/parameter mapping.
    """
    with open(file_name, "r") as handle:
        return json.load(handle)
def make_padding(width, cropsize, offset):
    """Compute (left, right, roi_size) padding for windowed inference.

    ``roi_size`` is the crop window minus the two offset margins; the
    right padding rounds ``width`` up to a whole number of windows.
    """
    left = offset
    roi_size = cropsize - 2 * offset
    # Degenerate case: the margins consume the whole crop window.
    if not roi_size:
        roi_size = cropsize
    right = left + roi_size - width % roi_size
    return left, right, roi_size
def inference(X_spec, device, model, aggressiveness, data):
    """Run windowed mask prediction over a full complex spectrogram.

    Splits the peak-normalized magnitude into fixed-size windows, runs
    ``model.predict`` on each, and stitches the results back together.
    When ``data["tta"]`` is set, a second half-window-shifted pass is
    averaged in.  Returns (prediction * coef, magnitude, unit-phase).

    data : dic configs
    """

    def _execute(
        X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
    ):
        # Slide a window of data["window_size"] frames across the padded
        # magnitude, one roi_size hop at a time, collecting predictions.
        model.eval()
        with torch.no_grad():
            preds = []

            iterations = [n_window]

            total_iterations = sum(iterations)  # NOTE: unused — kept as-is
            for i in tqdm(range(n_window)):
                start = i * roi_size
                X_mag_window = X_mag_pad[
                    None, :, :, start : start + data["window_size"]
                ]
                X_mag_window = torch.from_numpy(X_mag_window)
                if is_half:
                    X_mag_window = X_mag_window.half()
                X_mag_window = X_mag_window.to(device)

                pred = model.predict(X_mag_window, aggressiveness)

                pred = pred.detach().cpu().numpy()
                preds.append(pred[0])

            # Reassemble the windows along the frame axis.
            pred = np.concatenate(preds, axis=2)
        return pred

    def preprocess(X_spec):
        # Split the complex spectrogram into magnitude and phase.
        X_mag = np.abs(X_spec)
        X_phase = np.angle(X_spec)

        return X_mag, X_phase

    X_mag, X_phase = preprocess(X_spec)

    # Normalize by the peak magnitude; undone on return via ``coef``.
    coef = X_mag.max()
    X_mag_pre = X_mag / coef

    n_frame = X_mag_pre.shape[2]
    pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
    n_window = int(np.ceil(n_frame / roi_size))

    X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

    # Feed half-precision inputs when the checkpoint itself is fp16.
    if list(model.state_dict().values())[0].dtype == torch.float16:
        is_half = True
    else:
        is_half = False
    pred = _execute(
        X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
    )
    pred = pred[:, :, :n_frame]

    if data["tta"]:
        # Test-time augmentation: repeat with a half-window shift and
        # average the two passes.
        pad_l += roi_size // 2
        pad_r += roi_size // 2
        n_window += 1

        X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

        pred_tta = _execute(
            X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
        )
        # Drop the extra half-window lead-in, then trim to the true length.
        pred_tta = pred_tta[:, :, roi_size // 2 :]
        pred_tta = pred_tta[:, :, :n_frame]

        return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
    else:
        return pred * coef, X_mag, np.exp(1.0j * X_phase)
def _get_name_params(model_path, model_hash):
    """Resolve UVR5 parameter names/configs for a model checkpoint.

    Scans the table returned by ``load_data()`` for an entry whose
    ``hash_name`` either equals *model_hash* exactly or occurs as a
    substring of *model_path*.  A match inside the "equivalent" group is
    returned immediately; for any other group the search continues and the
    last match found wins.

    Args:
        model_path: filesystem path (or name) of the model file.
        model_hash: hash string computed for the model file.

    Returns:
        (param_name, model_params) from the matching entry.

    Raises:
        UnboundLocalError: if no entry matches at all — preserved from the
        original behaviour so callers still get a hard failure.
    """
    data = load_data()
    flag = False
    model_name = model_path  # hash_name fragments are matched against the path
    # `model_type` / `model_key` replace the original loop variables
    # `type` / `model`, which shadowed Python builtins.
    for model_type in list(data):
        for model_key in list(data[model_type][0]):
            for i in range(len(data[model_type][0][model_key])):
                entry = data[model_type][0][model_key][i]
                if str(entry["hash_name"]) == model_hash:
                    flag = True
                elif str(entry["hash_name"]) in model_name:
                    flag = True

                if flag:
                    model_params_auto = entry["model_params"]
                    param_name_auto = entry["param_name"]
                    if model_type == "equivalent":
                        # Exact/equivalent entries are authoritative.
                        return param_name_auto, model_params_auto
                    else:
                        # Remember this candidate but keep searching.
                        flag = False
    return param_name_auto, model_params_auto
infer/modules/ipex/__init__.py
0 → 100644
View file @
9867304a
import
os
import
sys
import
contextlib
import
torch
import
intel_extension_for_pytorch
as
ipex
# pylint: disable=import-error, unused-import
from
.hijacks
import
ipex_hijacks
from
.attention
import
attention_init
# pylint: disable=protected-access, missing-function-docstring, line-too-long
def ipex_init():  # pylint: disable=too-many-statements
    """Monkey-patch ``torch.cuda`` to forward to Intel ``torch.xpu``.

    Rebinds the CUDA namespace (devices, streams, tensor/storage classes,
    memory accounting, RNG, AMP) onto the XPU equivalents so CUDA-assuming
    code runs on Intel GPUs, then installs the attention/bmm slicing
    hijacks.  Everything is wrapped in one broad try so a partial failure
    is reported instead of raised.

    Returns:
        (True, None) on success, (False, exception) on any failure.
    """
    try:
        # Replace cuda with xpu:
        torch.cuda.current_device = torch.xpu.current_device
        torch.cuda.current_stream = torch.xpu.current_stream
        torch.cuda.device = torch.xpu.device
        torch.cuda.device_count = torch.xpu.device_count
        torch.cuda.device_of = torch.xpu.device_of
        torch.cuda.get_device_name = torch.xpu.get_device_name
        torch.cuda.get_device_properties = torch.xpu.get_device_properties
        torch.cuda.init = torch.xpu.init
        torch.cuda.is_available = torch.xpu.is_available
        torch.cuda.is_initialized = torch.xpu.is_initialized
        # No XPU equivalent is used here — graph capture is reported as off.
        torch.cuda.is_current_stream_capturing = lambda: False
        torch.cuda.set_device = torch.xpu.set_device
        torch.cuda.stream = torch.xpu.stream
        torch.cuda.synchronize = torch.xpu.synchronize
        torch.cuda.Event = torch.xpu.Event
        torch.cuda.Stream = torch.xpu.Stream
        torch.cuda.FloatTensor = torch.xpu.FloatTensor
        torch.Tensor.cuda = torch.Tensor.xpu
        torch.Tensor.is_cuda = torch.Tensor.is_xpu
        # Lazy-init plumbing mirrored from torch.xpu.lazy_init:
        torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock
        torch.cuda._initialized = torch.xpu.lazy_init._initialized
        torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker
        torch.cuda._queued_calls = torch.xpu.lazy_init._queued_calls
        torch.cuda._tls = torch.xpu.lazy_init._tls
        torch.cuda.threading = torch.xpu.lazy_init.threading
        torch.cuda.traceback = torch.xpu.lazy_init.traceback
        torch.cuda.Optional = torch.xpu.Optional
        torch.cuda.__cached__ = torch.xpu.__cached__
        torch.cuda.__loader__ = torch.xpu.__loader__
        torch.cuda.ComplexFloatStorage = torch.xpu.ComplexFloatStorage
        torch.cuda.Tuple = torch.xpu.Tuple
        torch.cuda.streams = torch.xpu.streams
        torch.cuda._lazy_new = torch.xpu._lazy_new
        torch.cuda.FloatStorage = torch.xpu.FloatStorage
        torch.cuda.Any = torch.xpu.Any
        torch.cuda.__doc__ = torch.xpu.__doc__
        torch.cuda.default_generators = torch.xpu.default_generators
        torch.cuda.HalfTensor = torch.xpu.HalfTensor
        torch.cuda._get_device_index = torch.xpu._get_device_index
        torch.cuda.__path__ = torch.xpu.__path__
        torch.cuda.Device = torch.xpu.Device
        torch.cuda.IntTensor = torch.xpu.IntTensor
        torch.cuda.ByteStorage = torch.xpu.ByteStorage
        torch.cuda.set_stream = torch.xpu.set_stream
        torch.cuda.BoolStorage = torch.xpu.BoolStorage
        torch.cuda.os = torch.xpu.os
        torch.cuda.torch = torch.xpu.torch
        torch.cuda.BFloat16Storage = torch.xpu.BFloat16Storage
        torch.cuda.Union = torch.xpu.Union
        torch.cuda.DoubleTensor = torch.xpu.DoubleTensor
        torch.cuda.ShortTensor = torch.xpu.ShortTensor
        torch.cuda.LongTensor = torch.xpu.LongTensor
        torch.cuda.IntStorage = torch.xpu.IntStorage
        torch.cuda.LongStorage = torch.xpu.LongStorage
        torch.cuda.__annotations__ = torch.xpu.__annotations__
        torch.cuda.__package__ = torch.xpu.__package__
        torch.cuda.__builtins__ = torch.xpu.__builtins__
        torch.cuda.CharTensor = torch.xpu.CharTensor
        torch.cuda.List = torch.xpu.List
        torch.cuda._lazy_init = torch.xpu._lazy_init
        torch.cuda.BFloat16Tensor = torch.xpu.BFloat16Tensor
        torch.cuda.DoubleStorage = torch.xpu.DoubleStorage
        torch.cuda.ByteTensor = torch.xpu.ByteTensor
        torch.cuda.StreamContext = torch.xpu.StreamContext
        torch.cuda.ComplexDoubleStorage = torch.xpu.ComplexDoubleStorage
        torch.cuda.ShortStorage = torch.xpu.ShortStorage
        torch.cuda._lazy_call = torch.xpu._lazy_call
        torch.cuda.HalfStorage = torch.xpu.HalfStorage
        torch.cuda.random = torch.xpu.random
        torch.cuda._device = torch.xpu._device
        torch.cuda.classproperty = torch.xpu.classproperty
        torch.cuda.__name__ = torch.xpu.__name__
        torch.cuda._device_t = torch.xpu._device_t
        torch.cuda.warnings = torch.xpu.warnings
        torch.cuda.__spec__ = torch.xpu.__spec__
        torch.cuda.BoolTensor = torch.xpu.BoolTensor
        torch.cuda.CharStorage = torch.xpu.CharStorage
        torch.cuda.__file__ = torch.xpu.__file__
        torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork
        # torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing
        # Memory:
        torch.cuda.memory = torch.xpu.memory
        if "linux" in sys.platform and "WSL2" in os.popen("uname -a").read():
            # WSL2: disable empty_cache before mirroring it below.
            torch.xpu.empty_cache = lambda: None
        torch.cuda.empty_cache = torch.xpu.empty_cache
        torch.cuda.memory_stats = torch.xpu.memory_stats
        torch.cuda.memory_summary = torch.xpu.memory_summary
        torch.cuda.memory_snapshot = torch.xpu.memory_snapshot
        torch.cuda.memory_allocated = torch.xpu.memory_allocated
        torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated
        torch.cuda.memory_reserved = torch.xpu.memory_reserved
        torch.cuda.memory_cached = torch.xpu.memory_reserved
        torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved
        torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved
        torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats
        torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats
        torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats
        torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict
        torch.cuda.reset_accumulated_memory_stats = (
            torch.xpu.reset_accumulated_memory_stats
        )
        # RNG:
        torch.cuda.get_rng_state = torch.xpu.get_rng_state
        torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all
        torch.cuda.set_rng_state = torch.xpu.set_rng_state
        torch.cuda.set_rng_state_all = torch.xpu.set_rng_state_all
        torch.cuda.manual_seed = torch.xpu.manual_seed
        torch.cuda.manual_seed_all = torch.xpu.manual_seed_all
        torch.cuda.seed = torch.xpu.seed
        torch.cuda.seed_all = torch.xpu.seed_all
        torch.cuda.initial_seed = torch.xpu.initial_seed
        # AMP:
        torch.cuda.amp = torch.xpu.amp
        if not hasattr(torch.cuda.amp, "common"):
            torch.cuda.amp.common = contextlib.nullcontext()
        torch.cuda.amp.common.amp_definitely_not_available = lambda: False
        # GradScaler: try the native XPU one, then a patched one from
        # .gradscaler, finally fall back to IPEX's CPU scaler.
        try:
            torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler
        except Exception:  # pylint: disable=broad-exception-caught
            try:
                from .gradscaler import (
                    gradscaler_init,
                )  # pylint: disable=import-outside-toplevel, import-error

                gradscaler_init()
                torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler
            except Exception:  # pylint: disable=broad-exception-caught
                torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler
        # C
        torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream
        ipex._C._DeviceProperties.major = 2023
        ipex._C._DeviceProperties.minor = 2
        # Fix functions with ipex:
        # mem_get_info mimics CUDA's (free, total) pair.
        torch.cuda.mem_get_info = lambda device=None: [
            (
                torch.xpu.get_device_properties(device).total_memory
                - torch.xpu.memory_allocated(device)
            ),
            torch.xpu.get_device_properties(device).total_memory,
        ]
        torch._utils._get_available_device_type = lambda: "xpu"
        torch.has_cuda = True
        torch.cuda.has_half = True
        torch.cuda.is_bf16_supported = lambda *args, **kwargs: True
        torch.cuda.is_fp16_supported = lambda *args, **kwargs: True
        # Spoofed CUDA version/capability so version checks elsewhere pass.
        torch.version.cuda = "11.7"
        torch.cuda.get_device_capability = lambda *args, **kwargs: [11, 7]
        torch.cuda.get_device_properties.major = 11
        torch.cuda.get_device_properties.minor = 7
        torch.cuda.ipc_collect = lambda *args, **kwargs: None
        torch.cuda.utilization = lambda *args, **kwargs: 0
        # IPEX renamed this API across versions; support both spellings.
        if hasattr(torch.xpu, "getDeviceIdListForCard"):
            torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard
            torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard
        else:
            torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card
            torch.cuda.get_device_id_list_per_card = (
                torch.xpu.get_device_id_list_per_card
            )
        ipex_hijacks()
        attention_init()
        try:
            from .diffusers import ipex_diffusers

            ipex_diffusers()
        except Exception:  # pylint: disable=broad-exception-caught
            pass
    except Exception as e:
        return False, e
    return True, None
infer/modules/ipex/attention.py
0 → 100644
View file @
9867304a
import
torch
import
intel_extension_for_pytorch
as
ipex
# pylint: disable=import-error, unused-import
# pylint: disable=protected-access, missing-function-docstring, line-too-long
# Keep a reference to the real bmm before attention_init() patches torch.bmm.
original_torch_bmm = torch.bmm


def torch_bmm(input, mat2, *, out=None):
    """Drop-in replacement for ``torch.bmm`` that slices oversized inputs.

    ARC GPUs can't allocate more than 4GB to a single block, so when the
    estimated result block exceeds the threshold the batch (and, if still
    too large, the token dimension) is processed in halvable slices.
    Small inputs go straight to the original ``torch.bmm``.

    Fix vs. the original version: a non-None ``out`` used to be forwarded
    to every *sliced* inner call, which raises on the shape mismatch; the
    slices are now accumulated and copied into ``out`` once at the end.

    NOTE(review): if the batch/token count is not divisible by the chosen
    slice size, the remainder rows stay zero (unchanged from the original
    — confirm callers only hit power-of-two shapes).
    """
    if input.dtype != mat2.dtype:
        mat2 = mat2.to(input.dtype)

    batch_size_attention, input_tokens, mat2_shape = (
        input.shape[0],
        input.shape[1],
        mat2.shape[2],
    )
    block_multiply = input.element_size()
    # Estimated size (units appear to be MB despite the 4GB comment above
    # — TODO confirm the intended threshold).
    slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply
    block_size = batch_size_attention * slice_block_size

    # Halve the batch slice until a single slice fits under the threshold.
    split_slice_size = batch_size_attention
    if block_size > 4:
        do_split = True
        while (split_slice_size * slice_block_size) > 4:
            split_slice_size = split_slice_size // 2
            if split_slice_size <= 1:
                split_slice_size = 1
                break
    else:
        do_split = False

    # If one batch slice is still too big, additionally slice the tokens.
    split_2_slice_size = input_tokens
    if split_slice_size * slice_block_size > 4:
        slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply
        do_split_2 = True
        while (split_2_slice_size * slice_block_size2) > 4:
            split_2_slice_size = split_2_slice_size // 2
            if split_2_slice_size <= 1:
                split_2_slice_size = 1
                break
    else:
        do_split_2 = False

    if not do_split:
        # Fits in one block — defer entirely to the real bmm.
        return original_torch_bmm(input, mat2, out=out)

    hidden_states = torch.zeros(
        input.shape[0],
        input.shape[1],
        mat2.shape[2],
        device=input.device,
        dtype=input.dtype,
    )
    for i in range(batch_size_attention // split_slice_size):
        start_idx = i * split_slice_size
        end_idx = (i + 1) * split_slice_size
        if do_split_2:
            for i2 in range(input_tokens // split_2_slice_size):  # pylint: disable=invalid-name
                start_idx_2 = i2 * split_2_slice_size
                end_idx_2 = (i2 + 1) * split_2_slice_size
                hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = (
                    original_torch_bmm(
                        input[start_idx:end_idx, start_idx_2:end_idx_2],
                        mat2[start_idx:end_idx, start_idx_2:end_idx_2],
                    )
                )
        else:
            hidden_states[start_idx:end_idx] = original_torch_bmm(
                input[start_idx:end_idx], mat2[start_idx:end_idx]
            )
    if out is not None:
        # Honor the out= contract: fill the caller's buffer and return it.
        out.copy_(hidden_states)
        return out
    return hidden_states
# Keep a reference to the real SDPA before attention_init() patches it.
original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention


def scaled_dot_product_attention(
    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
):
    """SDPA wrapper that slices oversized inputs for ARC GPUs.

    Accepts 3-D (batch, tokens, dim) or 4-D (heads, batch, tokens, dim)
    inputs; large inputs are processed in batch slices (and token slices if
    still too large), writing into a zero-initialized buffer.  Small inputs
    defer entirely to the original implementation.

    NOTE(review): the output buffer is allocated with ``query.shape`` — this
    assumes value's last dim equals query's; confirm for callers where they
    differ.  If the batch/token count isn't divisible by the chosen slice
    size, the remainder stays zero.  The size units appear to be MB despite
    the "4GB" comment — TODO confirm the intended threshold.
    """
    # ARC GPUs can't allocate more than 4GB to a single block, Slice it:
    if len(query.shape) == 3:
        batch_size_attention, query_tokens, shape_four = query.shape
        shape_one = 1
        no_shape_one = True
    else:
        shape_one, batch_size_attention, query_tokens, shape_four = query.shape
        no_shape_one = False
    block_multiply = query.element_size()
    slice_block_size = (
        shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply
    )
    block_size = batch_size_attention * slice_block_size

    # First split level: halve the batch slice until it fits.
    split_slice_size = batch_size_attention
    if block_size > 4:
        do_split = True
        # Find something divisible with the shape_one
        while (split_slice_size * slice_block_size) > 4:
            split_slice_size = split_slice_size // 2
            if split_slice_size <= 1:
                split_slice_size = 1
                break
    else:
        do_split = False

    # Second split level: halve the token slice if one batch slice is
    # still over the threshold.
    split_2_slice_size = query_tokens
    if split_slice_size * slice_block_size > 4:
        slice_block_size2 = (
            shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply
        )
        do_split_2 = True
        # Find something divisible with the batch_size_attention
        while (split_2_slice_size * slice_block_size2) > 4:
            split_2_slice_size = split_2_slice_size // 2
            if split_2_slice_size <= 1:
                split_2_slice_size = 1
                break
    else:
        do_split_2 = False

    if do_split:
        hidden_states = torch.zeros(query.shape, device=query.device, dtype=query.dtype)
        for i in range(batch_size_attention // split_slice_size):
            start_idx = i * split_slice_size
            end_idx = (i + 1) * split_slice_size
            if do_split_2:
                for i2 in range(  # pylint: disable=invalid-name
                    query_tokens // split_2_slice_size
                ):
                    start_idx_2 = i2 * split_2_slice_size
                    end_idx_2 = (i2 + 1) * split_2_slice_size
                    if no_shape_one:
                        # 3-D layout: slice dims 0 (batch) and 1 (tokens).
                        hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = (
                            original_scaled_dot_product_attention(
                                query[start_idx:end_idx, start_idx_2:end_idx_2],
                                key[start_idx:end_idx, start_idx_2:end_idx_2],
                                value[start_idx:end_idx, start_idx_2:end_idx_2],
                                attn_mask=(
                                    attn_mask[start_idx:end_idx, start_idx_2:end_idx_2]
                                    if attn_mask is not None
                                    else attn_mask
                                ),
                                dropout_p=dropout_p,
                                is_causal=is_causal,
                            )
                        )
                    else:
                        # 4-D layout: slice dims 1 (batch) and 2 (tokens).
                        hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = (
                            original_scaled_dot_product_attention(
                                query[:, start_idx:end_idx, start_idx_2:end_idx_2],
                                key[:, start_idx:end_idx, start_idx_2:end_idx_2],
                                value[:, start_idx:end_idx, start_idx_2:end_idx_2],
                                attn_mask=(
                                    attn_mask[
                                        :, start_idx:end_idx, start_idx_2:end_idx_2
                                    ]
                                    if attn_mask is not None
                                    else attn_mask
                                ),
                                dropout_p=dropout_p,
                                is_causal=is_causal,
                            )
                        )
            else:
                if no_shape_one:
                    hidden_states[start_idx:end_idx] = (
                        original_scaled_dot_product_attention(
                            query[start_idx:end_idx],
                            key[start_idx:end_idx],
                            value[start_idx:end_idx],
                            attn_mask=(
                                attn_mask[start_idx:end_idx]
                                if attn_mask is not None
                                else attn_mask
                            ),
                            dropout_p=dropout_p,
                            is_causal=is_causal,
                        )
                    )
                else:
                    hidden_states[:, start_idx:end_idx] = (
                        original_scaled_dot_product_attention(
                            query[:, start_idx:end_idx],
                            key[:, start_idx:end_idx],
                            value[:, start_idx:end_idx],
                            attn_mask=(
                                attn_mask[:, start_idx:end_idx]
                                if attn_mask is not None
                                else attn_mask
                            ),
                            dropout_p=dropout_p,
                            is_causal=is_causal,
                        )
                    )
    else:
        return original_scaled_dot_product_attention(
            query,
            key,
            value,
            attn_mask=attn_mask,
            dropout_p=dropout_p,
            is_causal=is_causal,
        )
    return hidden_states
def attention_init():
    """Install the sliced attention/bmm wrappers globally.

    Must run after the module-level ``original_*`` references are captured,
    since the wrappers delegate to those.
    """
    # ARC GPUs can't allocate more than 4GB to a single block:
    torch.bmm = torch_bmm
    torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention
infer/modules/ipex/gradscaler.py
0 → 100644
View file @
9867304a
from
collections
import
defaultdict
import
torch
import
intel_extension_for_pytorch
as
ipex
# pylint: disable=import-error, unused-import
import
intel_extension_for_pytorch._C
as
core
# pylint: disable=import-error, unused-import
# pylint: disable=protected-access, missing-function-docstring, line-too-long
# Re-export IPEX's private CPU GradScaler machinery so the overrides below
# (_unscale_grads_, unscale_, update) can reuse its state types.
OptState = ipex.cpu.autocast._grad_scaler.OptState
_MultiDeviceReplicator = ipex.cpu.autocast._grad_scaler._MultiDeviceReplicator
_refresh_per_optimizer_state = (
    ipex.cpu.autocast._grad_scaler._refresh_per_optimizer_state
)
def _unscale_grads_(
    self, optimizer, inv_scale, found_inf, allow_fp16
):  # pylint: disable=unused-argument
    """GradScaler._unscale_grads_ override that unscales on the CPU.

    Multiplies every gradient of *optimizer* by *inv_scale* in place and
    records inf/NaN hits into *found_inf*, after moving each gradient to
    the CPU (XPU tensors are not handled by the IPEX core op used here).

    Returns:
        The per-device found_inf tensors dict from the replicator.

    Raises:
        ValueError: on FP16 gradients when allow_fp16 is False.
    """
    per_device_inv_scale = _MultiDeviceReplicator(inv_scale)
    per_device_found_inf = _MultiDeviceReplicator(found_inf)

    # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
    # There could be hundreds of grads, so we'd like to iterate through them just once.
    # However, we don't know their devices or dtypes in advance.
    # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
    # Google says mypy struggles with defaultdicts type annotations.
    per_device_and_dtype_grads = defaultdict(
        lambda: defaultdict(list)
    )  # type: ignore[var-annotated]
    # sync grad to master weight
    if hasattr(optimizer, "sync_grad"):
        optimizer.sync_grad()
    with torch.no_grad():
        for group in optimizer.param_groups:
            for param in group["params"]:
                if param.grad is None:
                    continue
                if (not allow_fp16) and param.grad.dtype == torch.float16:
                    raise ValueError("Attempting to unscale FP16 gradients.")
                if param.grad.is_sparse:
                    # is_coalesced() == False means the sparse grad has values with duplicate indices.
                    # coalesce() deduplicates indices and adds all values that have the same index.
                    # For scaled fp16 values, there's a good chance coalescing will cause overflow,
                    # so we should check the coalesced _values().
                    if param.grad.dtype is torch.float16:
                        param.grad = param.grad.coalesce()
                    to_unscale = param.grad._values()
                else:
                    to_unscale = param.grad

                # -: is there a way to split by device and dtype without appending in the inner loop?
                # Moved to CPU so the core op below (and the "cpu" keyed
                # replicator lookups) apply uniformly.
                to_unscale = to_unscale.to("cpu")
                per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append(
                    to_unscale
                )
        for _, per_dtype_grads in per_device_and_dtype_grads.items():
            for grads in per_dtype_grads.values():
                core._amp_foreach_non_finite_check_and_unscale_(
                    grads,
                    per_device_found_inf.get("cpu"),
                    per_device_inv_scale.get("cpu"),
                )
    return per_device_found_inf._per_device_tensors
def unscale_(self, optimizer):
    """
    Divides ("unscales") the optimizer's gradient tensors by the scale factor.
    :meth:`unscale_` is optional, serving cases where you need to
    :ref:`modify or inspect gradients<working-with-unscaled-gradients>`
    between the backward pass(es) and :meth:`step`.
    If :meth:`unscale_` is not called explicitly, gradients will be unscaled  automatically during :meth:`step`.
    Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::
        ...
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        scaler.step(optimizer)
        scaler.update()
    Args:
        optimizer (torch.optim.Optimizer):  Optimizer that owns the gradients to be unscaled.
    .. warning::
        :meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
        and only after all gradients for that optimizer's assigned parameters have been accumulated.
        Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.
    .. warning::
        :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
    """
    if not self._enabled:
        return

    self._check_scale_growth_tracker("unscale_")

    optimizer_state = self._per_optimizer_states[id(optimizer)]

    # Reject double-unscale and unscale-after-step.
    if optimizer_state["stage"] is OptState.UNSCALED:  # pylint: disable=no-else-raise
        raise RuntimeError(
            "unscale_() has already been called on this optimizer since the last update()."
        )
    elif optimizer_state["stage"] is OptState.STEPPED:
        raise RuntimeError("unscale_() is being called after step().")

    # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
    # (Round-tripped through CPU because the reciprocal is taken there.)
    assert self._scale is not None
    inv_scale = (
        self._scale.to("cpu").double().reciprocal().float().to(self._scale.device)
    )
    found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device)

    optimizer_state["found_inf_per_device"] = self._unscale_grads_(
        optimizer, inv_scale, found_inf, False
    )
    optimizer_state["stage"] = OptState.UNSCALED
def update(self, new_scale=None):
    """
    Updates the scale factor.
    If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
    to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
    the scale is multiplied by ``growth_factor`` to increase it.
    Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
    used directly, it's used to fill GradScaler's internal scale tensor. So if
    ``new_scale`` was a tensor, later in-place changes to that tensor will not further
    affect the scale GradScaler uses internally.)
    Args:
        new_scale (float or :class:`torch.FloatTensor`, optional, default=None):  New scale factor.
    .. warning::
        :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
        been invoked for all optimizers used this iteration.
    """
    if not self._enabled:
        return

    _scale, _growth_tracker = self._check_scale_growth_tracker("update")

    if new_scale is not None:
        # Accept a new user-defined scale.
        if isinstance(new_scale, float):
            self._scale.fill_(new_scale)  # type: ignore[union-attr]
        else:
            reason = "new_scale should be a float or a 1-element torch.FloatTensor with requires_grad=False."
            assert isinstance(
                new_scale, torch.FloatTensor
            ), reason  # type: ignore[attr-defined]
            assert new_scale.numel() == 1, reason
            assert new_scale.requires_grad is False, reason
            self._scale.copy_(new_scale)  # type: ignore[union-attr]
    else:
        # Consume shared inf/nan data collected from optimizers to update the scale.
        # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous.
        found_infs = [
            found_inf.to(device="cpu", non_blocking=True)
            for state in self._per_optimizer_states.values()
            for found_inf in state["found_inf_per_device"].values()
        ]

        assert len(found_infs) > 0, "No inf checks were recorded prior to update."

        found_inf_combined = found_infs[0]
        if len(found_infs) > 1:
            for i in range(1, len(found_infs)):
                found_inf_combined += found_infs[i]

        # The IPEX core update op runs on CPU tensors; round-trip the
        # scale/growth tracker and restore their original device after.
        to_device = _scale.device
        _scale = _scale.to("cpu")
        _growth_tracker = _growth_tracker.to("cpu")

        core._amp_update_scale_(
            _scale,
            _growth_tracker,
            found_inf_combined,
            self._growth_factor,
            self._backoff_factor,
            self._growth_interval,
        )

        _scale = _scale.to(to_device)
        _growth_tracker = _growth_tracker.to(to_device)
    # To prepare for next iteration, clear the data collected from optimizers this iteration.
    self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
def gradscaler_init():
    """Patch IPEX's CPU GradScaler into ``torch.xpu.amp`` with XPU-safe methods.

    Replaces the unscale/update machinery with the CPU-round-trip versions
    defined in this module, then returns the patched class.
    """
    torch.xpu.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler
    torch.xpu.amp.GradScaler._unscale_grads_ = _unscale_grads_
    torch.xpu.amp.GradScaler.unscale_ = unscale_
    torch.xpu.amp.GradScaler.update = update
    return torch.xpu.amp.GradScaler
infer/modules/ipex/hijacks.py
0 → 100644
View file @
9867304a
import
contextlib
import
importlib
import
torch
import
intel_extension_for_pytorch
as
ipex
# pylint: disable=import-error, unused-import
# pylint: disable=protected-access, missing-function-docstring, line-too-long, unnecessary-lambda, no-else-return
class CondFunc:
    """Conditionally divert calls from a function to a replacement.

    ``CondFunc(target, sub_func, cond_func)`` returns a callable that
    invokes ``sub_func(orig, *args, **kwargs)`` whenever ``cond_func`` is
    falsy or ``cond_func(orig, *args, **kwargs)`` is true, and the original
    function otherwise.  When ``target`` is a dotted-path string, the named
    attribute is also patched in place on its owning module/object.
    """

    def __new__(cls, orig_func, sub_func, cond_func):
        self = super().__new__(cls)
        if isinstance(orig_func, str):
            func_path = orig_func.split(".")
            # Import the longest importable prefix of the dotted path.
            for i in range(len(func_path) - 1, -1, -1):
                try:
                    resolved_obj = importlib.import_module(".".join(func_path[:i]))
                    break
                except ImportError:
                    pass
            # Walk the remaining attributes down to the owner of the target.
            for attr_name in func_path[i:-1]:
                resolved_obj = getattr(resolved_obj, attr_name)
            orig_func = getattr(resolved_obj, func_path[-1])
            # Patch the attribute so existing call sites hit this wrapper.
            setattr(
                resolved_obj,
                func_path[-1],
                lambda *args, **kwargs: self(*args, **kwargs),
            )
        self.__init__(orig_func, sub_func, cond_func)
        return lambda *args, **kwargs: self(*args, **kwargs)

    def __init__(self, orig_func, sub_func, cond_func):
        self.__orig_func = orig_func
        self.__sub_func = sub_func
        self.__cond_func = cond_func

    def __call__(self, *args, **kwargs):
        # With no condition the substitute always runs; otherwise ask it.
        if self.__cond_func and not self.__cond_func(self.__orig_func, *args, **kwargs):
            return self.__orig_func(*args, **kwargs)
        return self.__sub_func(self.__orig_func, *args, **kwargs)
# Shorthand alias for torch's DataLoader internals (used by the
# _shutdown_workers override below).
_utils = torch.utils.data._utils
def _shutdown_workers(self):
    """Replacement DataLoader worker shutdown (mirrors torch's teardown order).

    Signals the pin-memory thread and worker processes to stop, joins them
    with a timeout, closes the queues, and force-terminates any worker that
    is still alive.  Bails out early during interpreter shutdown, when
    torch's _utils module may already be partially torn down.
    """
    if (
        torch.utils.data._utils is None
        or torch.utils.data._utils.python_exit_status is True
        or torch.utils.data._utils.python_exit_status is None
    ):
        # Interpreter is exiting — the machinery below may no longer exist.
        return
    if hasattr(self, "_shutdown") and not self._shutdown:
        self._shutdown = True
        try:
            # First drain/stop the pin-memory thread, if one was started.
            if hasattr(self, "_pin_memory_thread"):
                self._pin_memory_thread_done_event.set()
                self._worker_result_queue.put((None, None))
                self._pin_memory_thread.join()
                self._worker_result_queue.cancel_join_thread()
                self._worker_result_queue.close()
            # Then signal the worker processes and join them.
            self._workers_done_event.set()
            for worker_id in range(len(self._workers)):
                if self._persistent_workers or self._workers_status[worker_id]:
                    self._mark_worker_as_unavailable(worker_id, shutdown=True)
            for w in self._workers:  # pylint: disable=invalid-name
                w.join(timeout=torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL)
            for q in self._index_queues:  # pylint: disable=invalid-name
                q.cancel_join_thread()
                q.close()
        finally:
            # Always unregister PIDs and kill stragglers, even on error.
            if self._worker_pids_set:
                torch.utils.data._utils.signal_handling._remove_worker_pids(id(self))
                self._worker_pids_set = False
            for w in self._workers:  # pylint: disable=invalid-name
                if w.is_alive():
                    w.terminate()
class DummyDataParallel(
    torch.nn.Module
):  # pylint: disable=missing-class-docstring, unused-argument, too-few-public-methods
    """Stand-in for ``torch.nn.DataParallel`` on IPEX.

    Instead of constructing a DataParallel wrapper, ``__new__`` returns the
    module itself moved to the XPU (multi-device parallelism is not
    supported; a warning is printed when more than one device is asked for).
    """

    def __new__(
        cls, module, device_ids=None, output_device=None, dim=0
    ):  # pylint: disable=unused-argument
        if isinstance(device_ids, list) and len(device_ids) > 1:
            print("IPEX backend doesn't support DataParallel on multiple XPU devices")
        # Note: returns the plain module, not a DummyDataParallel instance.
        return module.to("xpu")
def return_null_context(*args, **kwargs):  # pylint: disable=unused-argument
    """Accept any arguments and hand back a no-op context manager.

    Used as a drop-in replacement for CUDA-only context factories.
    """
    del args, kwargs  # intentionally ignored
    return contextlib.nullcontext()
def check_device(device):
    """Return True when *device* designates a CUDA target.

    Matches a ``torch.device`` of type "cuda", any string containing
    "cuda", or a bare integer device index.
    """
    if isinstance(device, torch.device):
        return device.type == "cuda"
    if isinstance(device, str):
        return "cuda" in device
    return isinstance(device, int)
def return_xpu(device):
    """Translate a CUDA-style device spec into the equivalent XPU spec.

    Accepts the same forms ``check_device`` recognizes:
    - "cuda:N" (or any "type:N" string) -> "xpu:N"
    - bare int N                        -> "xpu:N"
    - torch.device                      -> torch.device("xpu")
    - anything else                     -> "xpu"
    """
    if isinstance(device, str) and ":" in device:
        # Keep the full index: "cuda:12" -> "xpu:12".  The original
        # f"xpu:{device[-1]}" kept only the last character, mapping
        # "cuda:12" to "xpu:2".
        return f"xpu:{device.split(':')[-1]}"
    if isinstance(device, int):
        return f"xpu:{device}"
    if isinstance(device, torch.device):
        # NOTE(review): the device index is discarded here (as in the
        # original) — confirm multi-GPU callers don't rely on it.
        return torch.device("xpu")
    return "xpu"
def ipex_no_cuda(orig_func, *args, **kwargs):
    """Call *orig_func* with ``torch.cuda.is_available`` forced to False.

    Some libraries pick a CUDA-specific code path when
    ``torch.cuda.is_available()`` is true; this shim hides CUDA for the
    duration of the call, then re-points the probe at ``torch.xpu``
    (matching the global patching done by ipex_init()).

    Returns:
        Whatever *orig_func* returns.  (The original version discarded the
        result and also leaked the ``False`` stub if *orig_func* raised;
        the try/finally fixes both.)
    """
    torch.cuda.is_available = lambda: False
    try:
        return orig_func(*args, **kwargs)
    finally:
        torch.cuda.is_available = torch.xpu.is_available
# Capture the real autocast before any patching.
original_autocast = torch.autocast


def ipex_autocast(*args, **kwargs):
    """Forward to ``torch.autocast``, rewriting a leading "cuda" device to "xpu"."""
    if args and args[0] == "cuda":
        return original_autocast("xpu", *args[1:], **kwargs)
    return original_autocast(*args, **kwargs)
# Capture the real torch.cat before any patching.
original_torch_cat = torch.cat


def torch_cat(tensor, *args, **kwargs):
    """``torch.cat`` wrapper that reconciles dtypes for 3-element lists.

    When exactly three tensors are passed and the outer two disagree with
    the middle one's dtype, both are cast to the middle tensor's dtype
    before concatenating.  All other calls pass through unchanged.
    """
    mixed_triplet = len(tensor) == 3 and (
        tensor[0].dtype != tensor[1].dtype or tensor[2].dtype != tensor[1].dtype
    )
    if not mixed_triplet:
        return original_torch_cat(tensor, *args, **kwargs)
    target_dtype = tensor[1].dtype
    aligned = [tensor[0].to(target_dtype), tensor[1], tensor[2].to(target_dtype)]
    return original_torch_cat(aligned, *args, **kwargs)
# Capture the real interpolate before any patching.
original_interpolate = torch.nn.functional.interpolate


def interpolate(
    tensor,
    size=None,
    scale_factor=None,
    mode="nearest",
    align_corners=None,
    recompute_scale_factor=None,
    antialias=False,
):  # pylint: disable=too-many-arguments
    """``F.interpolate`` wrapper with a CPU/float32 fallback.

    Calls that use ``antialias`` or an explicit ``align_corners`` are run
    on the CPU in float32 and the result is cast back to the input's
    device and dtype; everything else passes straight through.
    """
    needs_cpu_fallback = antialias or align_corners is not None
    if not needs_cpu_fallback:
        return original_interpolate(
            tensor,
            size=size,
            scale_factor=scale_factor,
            mode=mode,
            align_corners=align_corners,
            recompute_scale_factor=recompute_scale_factor,
            antialias=antialias,
        )
    return_device = tensor.device
    return_dtype = tensor.dtype
    result = original_interpolate(
        tensor.to("cpu", dtype=torch.float32),
        size=size,
        scale_factor=scale_factor,
        mode=mode,
        align_corners=align_corners,
        recompute_scale_factor=recompute_scale_factor,
        antialias=antialias,
    )
    return result.to(return_device, dtype=return_dtype)
# Capture the real solver before any patching.
original_linalg_solve = torch.linalg.solve


def linalg_solve(A, B, *args, **kwargs):  # pylint: disable=invalid-name
    """``torch.linalg.solve`` wrapper that solves on the CPU for non-CPU inputs.

    If either operand lives off the CPU, both are moved there, the system
    is solved, and the result is returned on A's original device.
    """
    cpu = torch.device("cpu")
    if A.device == cpu and B.device == cpu:
        return original_linalg_solve(A, B, *args, **kwargs)
    return_device = A.device
    solution = original_linalg_solve(A.to("cpu"), B.to("cpu"), *args, **kwargs)
    return solution.to(return_device)
def
ipex_hijacks
():
CondFunc
(
"torch.Tensor.to"
,
lambda
orig_func
,
self
,
device
=
None
,
*
args
,
**
kwargs
:
orig_func
(
self
,
return_xpu
(
device
),
*
args
,
**
kwargs
),
lambda
orig_func
,
self
,
device
=
None
,
*
args
,
**
kwargs
:
check_device
(
device
),
)
CondFunc
(
"torch.Tensor.cuda"
,
lambda
orig_func
,
self
,
device
=
None
,
*
args
,
**
kwargs
:
orig_func
(
self
,
return_xpu
(
device
),
*
args
,
**
kwargs
),
lambda
orig_func
,
self
,
device
=
None
,
*
args
,
**
kwargs
:
check_device
(
device
),
)
# --- Device-reroute hijacks -------------------------------------------------
# Each CondFunc(target, wrapper, condition) pair patches `target` so that the
# wrapper runs whenever the condition lambda returns True. The wrappers below
# pass any explicit device argument through return_xpu(...) so CUDA/CPU device
# requests are redirected to the XPU; check_device decides when to fire.
CondFunc(
    "torch.empty",
    lambda orig_func, *args, device=None, **kwargs: orig_func(
        *args, device=return_xpu(device), **kwargs
    ),
    lambda orig_func, *args, device=None, **kwargs: check_device(device),
)
CondFunc(
    "torch.load",
    # NOTE(review): map_location is forwarded positionally after *args here,
    # unlike the keyword form used by the factory wrappers — confirm intended.
    lambda orig_func, *args, map_location=None, **kwargs: orig_func(
        *args, return_xpu(map_location), **kwargs
    ),
    # Fires for default loads (map_location unset) or approved devices.
    lambda orig_func, *args, map_location=None, **kwargs: map_location is None
    or check_device(map_location),
)
CondFunc(
    "torch.randn",
    lambda orig_func, *args, device=None, **kwargs: orig_func(
        *args, device=return_xpu(device), **kwargs
    ),
    lambda orig_func, *args, device=None, **kwargs: check_device(device),
)
CondFunc(
    "torch.ones",
    lambda orig_func, *args, device=None, **kwargs: orig_func(
        *args, device=return_xpu(device), **kwargs
    ),
    lambda orig_func, *args, device=None, **kwargs: check_device(device),
)
CondFunc(
    "torch.zeros",
    lambda orig_func, *args, device=None, **kwargs: orig_func(
        *args, device=return_xpu(device), **kwargs
    ),
    lambda orig_func, *args, device=None, **kwargs: check_device(device),
)
CondFunc(
    "torch.tensor",
    lambda orig_func, *args, device=None, **kwargs: orig_func(
        *args, device=return_xpu(device), **kwargs
    ),
    lambda orig_func, *args, device=None, **kwargs: check_device(device),
)
CondFunc(
    "torch.linspace",
    lambda orig_func, *args, device=None, **kwargs: orig_func(
        *args, device=return_xpu(device), **kwargs
    ),
    lambda orig_func, *args, device=None, **kwargs: check_device(device),
)
CondFunc(
    "torch.Generator",
    # Replace generic generators with XPU generators for any non-CPU device.
    lambda orig_func, device=None: torch.xpu.Generator(device),
    lambda orig_func, device=None: device is not None
    and device != torch.device("cpu")
    and device != "cpu",
)
CondFunc(
    "torch.batch_norm",
    # Substitute identity weight / zero bias when they are None, allocating
    # them on the input's device so the op does not fall back to CPU.
    lambda orig_func, input, weight, bias, *args, **kwargs: orig_func(
        input,
        (
            weight
            if weight is not None
            else torch.ones(input.size()[1], device=input.device)
        ),
        (
            bias
            if bias is not None
            else torch.zeros(input.size()[1], device=input.device)
        ),
        *args,
        **kwargs,
    ),
    # Only rewrite calls whose input already lives off-CPU.
    lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"),
)
CondFunc(
    "torch.instance_norm",
    # Same None-weight/None-bias substitution as the batch_norm hijack above.
    lambda orig_func, input, weight, bias, *args, **kwargs: orig_func(
        input,
        (
            weight
            if weight is not None
            else torch.ones(input.size()[1], device=input.device)
        ),
        (
            bias
            if bias is not None
            else torch.zeros(input.size()[1], device=input.device)
        ),
        *args,
        **kwargs,
    ),
    lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"),
)
# Functions with dtype errors:
# These forwards fail on XPU when input dtype differs from the parameter
# dtype, so the input is cast to the weight's dtype before dispatch.
CondFunc(
    "torch.nn.modules.GroupNorm.forward",
    lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)),
    lambda orig_func, self, input: input.dtype != self.weight.data.dtype,
)
CondFunc(
    "torch.nn.modules.linear.Linear.forward",
    lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)),
    lambda orig_func, self, input: input.dtype != self.weight.data.dtype,
)
CondFunc(
    "torch.nn.modules.conv.Conv2d.forward",
    lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)),
    lambda orig_func, self, input: input.dtype != self.weight.data.dtype,
)
CondFunc(
    "torch.nn.functional.layer_norm",
    lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: orig_func(
        input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs
    ),
    lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: weight
    is not None
    and input.dtype != weight.data.dtype,
)
# Diffusers Float64 (ARC GPUs doesn't support double or Float64):
if not torch.xpu.has_fp64_dtype():
    # Downcast float64 numpy arrays on the way in, since the GPU lacks fp64.
    CondFunc(
        "torch.from_numpy",
        lambda orig_func, ndarray: orig_func(ndarray.astype("float32")),
        lambda orig_func, ndarray: ndarray.dtype == float,
    )
# Broken functions when torch.cuda.is_available is True:
# Run the dataloader iterator init with CUDA availability masked off.
CondFunc(
    "torch.utils.data.dataloader._BaseDataLoaderIter.__init__",
    lambda orig_func, *args, **kwargs: ipex_no_cuda(orig_func, *args, **kwargs),
    lambda orig_func, *args, **kwargs: True,
)
# Functions that make compile mad with CondFunc:
# These are replaced by direct assignment instead of the CondFunc wrapper.
torch.utils.data.dataloader._MultiProcessingDataLoaderIter._shutdown_workers = (
    _shutdown_workers
)
torch.nn.DataParallel = DummyDataParallel
torch.autocast = ipex_autocast
torch.cat = torch_cat
torch.linalg.solve = linalg_solve
torch.nn.functional.interpolate = interpolate
torch.backends.cuda.sdp_kernel = return_null_context
infer/modules/onnx/export.py
0 → 100644
View file @
9867304a
import
torch
from
infer.lib.infer_pack.models_onnx
import
SynthesizerTrnMsNSFsidM
def export_onnx(ModelPath, ExportedPath):
    """Export an RVC checkpoint (.pth) to an ONNX model file.

    Args:
        ModelPath: path of the trained checkpoint to load (mapped to CPU).
        ExportedPath: destination path for the exported ONNX file.

    Returns:
        The string "Finished" on success.
    """
    cpt = torch.load(ModelPath, map_location="cpu")
    # Patch the speaker count in the stored config from the embedding table.
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    # v1 checkpoints use 256-dim content features, v2 uses 768-dim.
    vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768

    # Dummy inputs (200 frames) used only to trace the graph.
    test_phone = torch.rand(1, 200, vec_channels)  # hidden units
    test_phone_lengths = torch.tensor([200]).long()  # hidden-unit length (apparently unused)
    test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # base frequency (Hz bins)
    test_pitchf = torch.rand(1, 200)  # NSF base frequency
    test_ds = torch.LongTensor([0])  # speaker ID
    test_rnd = torch.rand(1, 192, 200)  # noise (adds a random factor)

    device = "cpu"  # export-time device (does not affect how the model is used)

    # fp32 export (C++ consumers would have to re-layout memory by hand to
    # support fp16, so half precision is not used for now).
    net_g = SynthesizerTrnMsNSFsidM(
        *cpt["config"], is_half=False, version=cpt.get("version", "v1")
    )
    net_g.load_state_dict(cpt["weight"], strict=False)
    input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
    output_names = [
        "audio",
    ]
    # net_g.construct_spkmixmap(n_speaker)  # multi-speaker mix-track export
    torch.onnx.export(
        net_g,
        (
            test_phone.to(device),
            test_phone_lengths.to(device),
            test_pitch.to(device),
            test_pitchf.to(device),
            test_ds.to(device),
            test_rnd.to(device),
        ),
        ExportedPath,
        # Frame-count (and noise-length) axes are dynamic so any clip length works.
        dynamic_axes={
            "phone": [1],
            "pitch": [1],
            "pitchf": [1],
            "rnd": [2],
        },
        do_constant_folding=False,
        opset_version=13,
        verbose=False,
        input_names=input_names,
        output_names=output_names,
    )
    return "Finished"
infer/modules/train/extract/extract_f0_print.py
0 → 100644
View file @
9867304a
import
os
import
sys
import
traceback
import
parselmouth
now_dir
=
os
.
getcwd
()
sys
.
path
.
append
(
now_dir
)
import
logging
import
numpy
as
np
import
pyworld
from
infer.lib.audio
import
load_audio
logging
.
getLogger
(
"numba"
).
setLevel
(
logging
.
WARNING
)
from
multiprocessing
import
Process
# CLI: exp_dir n_processes f0_method
exp_dir = sys.argv[1]
f = open("%s/extract_f0_feature.log" % exp_dir, "a+")


def printt(strr):
    """Print `strr` and mirror it into the shared extraction log."""
    print(strr)
    f.write("%s\n" % strr)
    f.flush()


n_p = int(sys.argv[2])  # number of worker processes
f0method = sys.argv[3]  # one of: pm / harvest / dio / rmvpe
class FeatureInput(object):
    """Extracts f0 (pitch) features from 16 kHz training audio.

    Supports several extractors ("pm", "harvest", "dio", "rmvpe") and a
    mel-scale quantizer (`coarse_f0`) mapping Hz values into bins 1..255.
    """

    def __init__(self, samplerate=16000, hop_size=160):
        self.fs = samplerate
        self.hop = hop_size
        # 256 quantization bins over [50 Hz, 1100 Hz] on the mel scale;
        # bin boundaries below are precomputed in mel space.
        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

    def compute_f0(self, path, f0_method):
        """Load the audio at `path` and return its f0 track via `f0_method`.

        Raises:
            ValueError: if `f0_method` is unknown (previously this fell
                through to an obscure NameError on `return f0`).
        """
        x = load_audio(path, self.fs)
        p_len = x.shape[0] // self.hop
        if f0_method == "pm":
            time_step = 160 / 16000 * 1000
            f0_min = 50
            f0_max = 1100
            f0 = (
                parselmouth.Sound(x, self.fs)
                .to_pitch_ac(
                    time_step=time_step / 1000,
                    voicing_threshold=0.6,
                    pitch_floor=f0_min,
                    pitch_ceiling=f0_max,
                )
                .selected_array["frequency"]
            )
            # Pad symmetrically so the track matches the expected frame count.
            pad_size = (p_len - len(f0) + 1) // 2
            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                f0 = np.pad(
                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
                )
        elif f0_method == "harvest":
            f0, t = pyworld.harvest(
                x.astype(np.double),
                fs=self.fs,
                f0_ceil=self.f0_max,
                f0_floor=self.f0_min,
                frame_period=1000 * self.hop / self.fs,
            )
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
        elif f0_method == "dio":
            f0, t = pyworld.dio(
                x.astype(np.double),
                fs=self.fs,
                f0_ceil=self.f0_max,
                f0_floor=self.f0_min,
                frame_period=1000 * self.hop / self.fs,
            )
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
        elif f0_method == "rmvpe":
            # Lazily load the model once per process (idiomatic `not hasattr`
            # instead of the former `hasattr(...) == False`).
            if not hasattr(self, "model_rmvpe"):
                from infer.lib.rmvpe import RMVPE

                print("Loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu"
                )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        else:
            raise ValueError("unknown f0 method: %s" % f0_method)
        return f0

    def coarse_f0(self, f0):
        """Quantize an f0 track (Hz) into integer mel bins in [1, 255]."""
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1
        # use 0 or 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
        f0_coarse = np.rint(f0_mel).astype(int)
        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
            f0_coarse.max(),
            f0_coarse.min(),
        )
        return f0_coarse

    def go(self, paths, f0_method):
        """Process (input, coarse-out, raw-out) path triples, skipping done ones."""
        if len(paths) == 0:
            printt("no-f0-todo")
        else:
            printt("todo-f0-%s" % len(paths))
            n = max(len(paths) // 5, 1)  # at most 5 progress lines per process
            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
                try:
                    if idx % n == 0:
                        printt(
                            "f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)
                        )
                    # Resume support: skip items whose outputs already exist
                    # (former `== True` comparisons dropped).
                    if os.path.exists(opt_path1 + ".npy") and os.path.exists(
                        opt_path2 + ".npy"
                    ):
                        continue
                    featur_pit = self.compute_f0(inp_path, f0_method)
                    np.save(
                        opt_path2,
                        featur_pit,
                        allow_pickle=False,
                    )  # nsf
                    coarse_pit = self.coarse_f0(featur_pit)
                    np.save(
                        opt_path1,
                        coarse_pit,
                        allow_pickle=False,
                    )  # ori
                except Exception:
                    # Was a bare `except:`; now lets KeyboardInterrupt/SystemExit
                    # propagate while still logging real failures.
                    printt(
                        "f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())
                    )
if __name__ == "__main__":
    # exp_dir=r"E:\codes\py39\dataset\mi-test"
    # n_p=16
    # f = open("%s/log_extract_f0.log"%exp_dir, "w")
    printt(" ".join(sys.argv))
    featureInput = FeatureInput()
    paths = []
    # Inputs: resampled 16 kHz wavs; outputs: coarse f0 (2a) and raw f0 (2b).
    inp_root = "%s/1_16k_wavs" % (exp_dir)
    opt_root1 = "%s/2a_f0" % (exp_dir)
    opt_root2 = "%s/2b-f0nsf" % (exp_dir)

    os.makedirs(opt_root1, exist_ok=True)
    os.makedirs(opt_root2, exist_ok=True)
    for name in sorted(list(os.listdir(inp_root))):
        inp_path = "%s/%s" % (inp_root, name)
        if "spec" in inp_path:
            continue
        opt_path1 = "%s/%s" % (opt_root1, name)
        opt_path2 = "%s/%s" % (opt_root2, name)
        paths.append([inp_path, opt_path1, opt_path2])

    # Shard the work list round-robin over n_p worker processes.
    ps = []
    for i in range(n_p):
        p = Process(
            target=featureInput.go,
            args=(
                paths[i::n_p],
                f0method,
            ),
        )
        ps.append(p)
        p.start()
    for i in range(n_p):
        ps[i].join()
infer/modules/train/extract/extract_f0_rmvpe.py
0 → 100644
View file @
9867304a
import
os
import
sys
import
traceback
import
parselmouth
now_dir
=
os
.
getcwd
()
sys
.
path
.
append
(
now_dir
)
import
logging
import
numpy
as
np
import
pyworld
from
infer.lib.audio
import
load_audio
logging
.
getLogger
(
"numba"
).
setLevel
(
logging
.
WARNING
)
# CLI: n_part i_part gpu_index exp_dir is_half
n_part = int(sys.argv[1])  # total number of parallel shards
i_part = int(sys.argv[2])  # this shard's index
i_gpu = sys.argv[3]
os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
exp_dir = sys.argv[4]
# BUGFIX: the old `is_half = sys.argv[5]` kept the raw *string*, so passing
# "False" was still truthy and silently enabled fp16 in RMVPE. Parse it to a
# real bool, matching the `.lower() == "true"` convention used by
# extract_feature_print.py.
is_half = sys.argv[5].lower() == "true"
f = open("%s/extract_f0_feature.log" % exp_dir, "a+")


def printt(strr):
    """Print `strr` and mirror it into the shared extraction log."""
    print(strr)
    f.write("%s\n" % strr)
    f.flush()
class FeatureInput(object):
    """f0 extraction worker for one GPU shard (rmvpe extractor only)."""

    def __init__(self, samplerate=16000, hop_size=160):
        self.fs = samplerate
        self.hop = hop_size
        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        # Mel-scale endpoints of the 256-bin quantization range.
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

    def compute_f0(self, path, f0_method):
        """Return the f0 track of the audio file at `path` (rmvpe only)."""
        audio = load_audio(path, self.fs)
        # p_len = x.shape[0] // self.hop
        if f0_method == "rmvpe":
            if not hasattr(self, "model_rmvpe"):
                # Instantiate the model lazily, once per process.
                from infer.lib.rmvpe import RMVPE

                print("Loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda"
                )
            f0 = self.model_rmvpe.infer_from_audio(audio, thred=0.03)
        return f0

    def coarse_f0(self, f0):
        """Quantize an f0 track (Hz) into integer mel bins in [1, f0_bin - 1]."""
        mel = 1127 * np.log(1 + f0 / 700)
        voiced = mel > 0
        mel[voiced] = (
            (mel[voiced] - self.f0_mel_min)
            * (self.f0_bin - 2)
            / (self.f0_mel_max - self.f0_mel_min)
            + 1
        )
        # Unvoiced frames (mel == 0) and out-of-range values collapse to the
        # range ends; clip implements the former two in-place clamps.
        quantized = np.rint(np.clip(mel, 1, self.f0_bin - 1)).astype(int)
        assert quantized.max() <= 255 and quantized.min() >= 1, (
            quantized.max(),
            quantized.min(),
        )
        return quantized

    def go(self, paths, f0_method):
        """Extract and save raw + coarse f0 for every path triple in `paths`."""
        if len(paths) == 0:
            printt("no-f0-todo")
            return
        printt("todo-f0-%s" % len(paths))
        step = max(len(paths) // 5, 1)  # at most 5 progress lines per process
        for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
            try:
                if idx % step == 0:
                    printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
                already_done = os.path.exists(opt_path1 + ".npy") and os.path.exists(
                    opt_path2 + ".npy"
                )
                if already_done:
                    continue
                featur_pit = self.compute_f0(inp_path, f0_method)
                np.save(opt_path2, featur_pit, allow_pickle=False)  # nsf
                np.save(
                    opt_path1, self.coarse_f0(featur_pit), allow_pickle=False
                )  # ori
            except:
                printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
if __name__ == "__main__":
    # exp_dir=r"E:\codes\py39\dataset\mi-test"
    # n_p=16
    # f = open("%s/log_extract_f0.log"%exp_dir, "w")
    printt(" ".join(sys.argv))
    featureInput = FeatureInput()
    paths = []
    # Inputs: resampled 16 kHz wavs; outputs: coarse f0 (2a) and raw f0 (2b).
    inp_root = "%s/1_16k_wavs" % (exp_dir)
    opt_root1 = "%s/2a_f0" % (exp_dir)
    opt_root2 = "%s/2b-f0nsf" % (exp_dir)

    os.makedirs(opt_root1, exist_ok=True)
    os.makedirs(opt_root2, exist_ok=True)
    for name in sorted(list(os.listdir(inp_root))):
        inp_path = "%s/%s" % (inp_root, name)
        if "spec" in inp_path:
            continue
        opt_path1 = "%s/%s" % (opt_root1, name)
        opt_path2 = "%s/%s" % (opt_root2, name)
        paths.append([inp_path, opt_path1, opt_path2])
    try:
        # Single-process per GPU shard: this process handles slice i_part::n_part.
        featureInput.go(paths[i_part::n_part], "rmvpe")
    except:
        printt("f0_all_fail-%s" % (traceback.format_exc()))
    # ps = []
    # for i in range(n_p):
    #     p = Process(
    #         target=featureInput.go,
    #         args=(
    #             paths[i::n_p],
    #             f0method,
    #         ),
    #     )
    #     ps.append(p)
    #     p.start()
    # for i in range(n_p):
    #     ps[i].join()
infer/modules/train/extract/extract_f0_rmvpe_dml.py
0 → 100644
View file @
9867304a
import
os
import
sys
import
traceback
import
parselmouth
now_dir
=
os
.
getcwd
()
sys
.
path
.
append
(
now_dir
)
import
logging
import
numpy
as
np
import
pyworld
from
infer.lib.audio
import
load_audio
logging
.
getLogger
(
"numba"
).
setLevel
(
logging
.
WARNING
)
# CLI: exp_dir (single-process DirectML variant — no GPU sharding arguments)
exp_dir = sys.argv[1]
import torch_directml

# DirectML device (Windows GPU backend without CUDA).
device = torch_directml.device(torch_directml.default_device())
f = open("%s/extract_f0_feature.log" % exp_dir, "a+")


def printt(strr):
    """Print `strr` and mirror it into the shared extraction log."""
    print(strr)
    f.write("%s\n" % strr)
    f.flush()
class FeatureInput(object):
    """Extracts f0 features using RMVPE on the DirectML device.

    Same quantization scheme as the other extract_f0 variants: 256 mel-scale
    bins over [50 Hz, 1100 Hz].
    """

    def __init__(self, samplerate=16000, hop_size=160):
        self.fs = samplerate
        self.hop = hop_size
        # Mel-scale endpoints of the 256-bin quantization range.
        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

    def compute_f0(self, path, f0_method):
        """Load audio at `path` and return its f0 track (rmvpe only).

        NOTE(review): an unknown `f0_method` leaves `f0` unassigned and the
        final `return f0` raises NameError — callers only pass "rmvpe".
        """
        x = load_audio(path, self.fs)
        # p_len = x.shape[0] // self.hop
        if f0_method == "rmvpe":
            # Lazily load the model once; fp32 on the DirectML device.
            if hasattr(self, "model_rmvpe") == False:
                from infer.lib.rmvpe import RMVPE

                print("Loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "assets/rmvpe/rmvpe.pt", is_half=False, device=device
                )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        return f0

    def coarse_f0(self, f0):
        """Quantize an f0 track (Hz) into integer mel bins in [1, 255]."""
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1
        # use 0 or 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
        f0_coarse = np.rint(f0_mel).astype(int)
        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
            f0_coarse.max(),
            f0_coarse.min(),
        )
        return f0_coarse

    def go(self, paths, f0_method):
        """Process (input, coarse-out, raw-out) path triples, skipping done ones."""
        if len(paths) == 0:
            printt("no-f0-todo")
        else:
            printt("todo-f0-%s" % len(paths))
            n = max(len(paths) // 5, 1)  # at most 5 progress lines per process
            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
                try:
                    if idx % n == 0:
                        printt(
                            "f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)
                        )
                    # Resume support: skip items whose outputs already exist.
                    if (
                        os.path.exists(opt_path1 + ".npy") == True
                        and os.path.exists(opt_path2 + ".npy") == True
                    ):
                        continue
                    featur_pit = self.compute_f0(inp_path, f0_method)
                    np.save(
                        opt_path2,
                        featur_pit,
                        allow_pickle=False,
                    )  # nsf
                    coarse_pit = self.coarse_f0(featur_pit)
                    np.save(
                        opt_path1,
                        coarse_pit,
                        allow_pickle=False,
                    )  # ori
                except:
                    printt(
                        "f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())
                    )
if __name__ == "__main__":
    # exp_dir=r"E:\codes\py39\dataset\mi-test"
    # n_p=16
    # f = open("%s/log_extract_f0.log"%exp_dir, "w")
    printt(" ".join(sys.argv))
    featureInput = FeatureInput()
    paths = []
    # Inputs: resampled 16 kHz wavs; outputs: coarse f0 (2a) and raw f0 (2b).
    inp_root = "%s/1_16k_wavs" % (exp_dir)
    opt_root1 = "%s/2a_f0" % (exp_dir)
    opt_root2 = "%s/2b-f0nsf" % (exp_dir)

    os.makedirs(opt_root1, exist_ok=True)
    os.makedirs(opt_root2, exist_ok=True)
    for name in sorted(list(os.listdir(inp_root))):
        inp_path = "%s/%s" % (inp_root, name)
        if "spec" in inp_path:
            continue
        opt_path1 = "%s/%s" % (opt_root1, name)
        opt_path2 = "%s/%s" % (opt_root2, name)
        paths.append([inp_path, opt_path1, opt_path2])
    try:
        # Single process handles the full list (no sharding on DirectML).
        featureInput.go(paths, "rmvpe")
    except:
        printt("f0_all_fail-%s" % (traceback.format_exc()))
    # ps = []
    # for i in range(n_p):
    #     p = Process(
    #         target=featureInput.go,
    #         args=(
    #             paths[i::n_p],
    #             f0method,
    #         ),
    #     )
    #     ps.append(p)
    #     p.start()
    # for i in range(n_p):
    #     ps[i].join()
infer/modules/train/extract_feature_print.py
0 → 100644
View file @
9867304a
import
os
import
sys
import
traceback
# Allow MPS to fall back to CPU for unsupported ops and disable the MPS
# high-watermark memory limit (macOS).
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

device = sys.argv[1]
n_part = int(sys.argv[2])  # total number of shards
i_part = int(sys.argv[3])  # this shard's index
if len(sys.argv) == 7:
    # Form 1: device n_part i_part exp_dir version is_half
    exp_dir = sys.argv[4]
    version = sys.argv[5]
    is_half = sys.argv[6].lower() == "true"
else:
    # Form 2: device n_part i_part i_gpu exp_dir version is_half
    i_gpu = sys.argv[4]
    exp_dir = sys.argv[5]
    os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
    version = sys.argv[6]
    is_half = sys.argv[7].lower() == "true"
import
fairseq
import
numpy
as
np
import
soundfile
as
sf
import
torch
import
torch.nn.functional
as
F
# Resolve the compute device: a DirectML request ("privateuseone") is honored
# as-is; otherwise prefer CUDA, then Apple MPS, then CPU.
if "privateuseone" not in device:
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
else:
    import torch_directml

    device = torch_directml.device(torch_directml.default_device())
def forward_dml(ctx, x, scale):
    """DirectML-safe replacement for fairseq GradMultiply.forward.

    Stores `scale` on the autograd context and returns a detached copy of
    `x`, matching the original forward's output value while avoiding an op
    unsupported on DirectML.
    """
    ctx.scale = scale
    return x.detach().clone()
# Monkey-patch fairseq's GradMultiply so feature extraction also works on
# DirectML (extraction is inference-only, so the gradient scaling is unused).
fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml

f = open("%s/extract_f0_feature.log" % exp_dir, "a+")


def printt(strr):
    """Print `strr` and mirror it into the shared extraction log."""
    print(strr)
    f.write("%s\n" % strr)
    f.flush()
printt(" ".join(sys.argv))
model_path = "assets/hubert/hubert_base.pt"

printt("exp_dir: " + exp_dir)
wavPath = "%s/1_16k_wavs" % exp_dir
# v1 models consume 256-dim HuBERT features, v2 consumes 768-dim.
outPath = (
    "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir
)
os.makedirs(outPath, exist_ok=True)
# wave must be 16k, hop_size=320
def readwave(wav_path, normalize=False):
    """Read a 16 kHz wav file as a float tensor of shape (1, n_samples).

    Stereo input is averaged down to mono. When `normalize` is True the
    waveform is layer-normalized over its full length (mirrors fairseq's
    task.normalize preprocessing).
    """
    wav, sr = sf.read(wav_path)
    assert sr == 16000
    feats = torch.from_numpy(wav).float()
    if feats.dim() == 2:  # double channels
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    if normalize:
        with torch.no_grad():
            feats = F.layer_norm(feats, feats.shape)
    feats = feats.view(1, -1)
    return feats
# HuBERT model
printt("load model(s) from {}".format(model_path))
# if hubert model is exist
if os.access(model_path, os.F_OK) == False:
    printt(
        "Error: Extracting is shut down because %s does not exist, you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main"
        % model_path
    )
    exit(0)
models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
    [model_path],
    suffix="",
)
model = models[0]
model = model.to(device)
printt("move model to %s" % device)
# fp16 only on devices that support it (not MPS or CPU).
if is_half:
    if device not in ["mps", "cpu"]:
        model = model.half()
model.eval()

# This shard processes every n_part-th file starting at i_part.
todo = sorted(list(os.listdir(wavPath)))[i_part::n_part]
n = max(1, len(todo) // 10)  # print at most 10 progress lines
if len(todo) == 0:
    printt("no-feature-todo")
else:
    printt("all-feature-%s" % len(todo))
    for idx, file in enumerate(todo):
        try:
            if file.endswith(".wav"):
                wav_path = "%s/%s" % (wavPath, file)
                out_path = "%s/%s" % (outPath, file.replace("wav", "npy"))

                # Resume support: skip files already extracted.
                if os.path.exists(out_path):
                    continue

                feats = readwave(wav_path, normalize=saved_cfg.task.normalize)
                padding_mask = torch.BoolTensor(feats.shape).fill_(False)
                inputs = {
                    "source": (
                        feats.half().to(device)
                        if is_half and device not in ["mps", "cpu"]
                        else feats.to(device)
                    ),
                    "padding_mask": padding_mask.to(device),
                    # v1 takes layer-9 features projected to 256 dims;
                    # v2 takes raw layer-12 features (768 dims).
                    "output_layer": 9 if version == "v1" else 12,  # layer 9
                }
                with torch.no_grad():
                    logits = model.extract_features(**inputs)
                    feats = (
                        model.final_proj(logits[0]) if version == "v1" else logits[0]
                    )

                feats = feats.squeeze(0).float().cpu().numpy()
                # Guard against NaNs (e.g. from fp16 overflow) before saving.
                if np.isnan(feats).sum() == 0:
                    np.save(out_path, feats, allow_pickle=False)
                else:
                    printt("%s-contains nan" % file)
                if idx % n == 0:
                    printt(
                        "now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape)
                    )
        except:
            printt(traceback.format_exc())
    printt("all-feature-done")
infer/modules/train/preprocess.py
0 → 100644
View file @
9867304a
import
multiprocessing
import
os
import
sys
from
scipy
import
signal
now_dir = os.getcwd()
sys.path.append(now_dir)
print(*sys.argv[1:])
# CLI: inp_root sr n_processes exp_dir noparallel per
inp_root = sys.argv[1]
sr = int(sys.argv[2])  # target sample rate of the ground-truth wavs
n_p = int(sys.argv[3])  # number of worker processes
exp_dir = sys.argv[4]
noparallel = sys.argv[5] == "True"  # run serially when True
per = float(sys.argv[6])  # segment length in seconds
import
os
import
traceback
import
librosa
import
numpy
as
np
from
scipy.io
import
wavfile
from
infer.lib.audio
import
load_audio
from
infer.lib.slicer2
import
Slicer
f = open("%s/preprocess.log" % exp_dir, "a+")


def println(strr):
    """Print `strr` and mirror it into the preprocessing log."""
    print(strr)
    f.write("%s\n" % strr)
    f.flush()
class PreProcess:
    """Slices, filters, normalizes and resamples raw training audio.

    Writes two parallel wav sets per slice: full-rate ground truth
    (0_gt_wavs) and a 16 kHz copy (1_16k_wavs) for feature extraction.
    """

    def __init__(self, sr, exp_dir, per=3.7):
        # Silence-based slicer; thresholds are empirical project defaults.
        self.slicer = Slicer(
            sr=sr,
            threshold=-42,
            min_length=1500,
            min_interval=400,
            hop_size=15,
            max_sil_kept=500,
        )
        self.sr = sr
        # 5th-order high-pass at 48 Hz to remove DC / rumble.
        self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)
        self.per = per  # segment length (s)
        self.overlap = 0.3  # overlap between consecutive segments (s)
        self.tail = self.per + self.overlap
        self.max = 0.9  # peak target after normalization
        self.alpha = 0.75  # mix ratio between normalized and raw signal
        self.exp_dir = exp_dir
        self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
        self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
        os.makedirs(self.exp_dir, exist_ok=True)
        os.makedirs(self.gt_wavs_dir, exist_ok=True)
        os.makedirs(self.wavs16k_dir, exist_ok=True)

    def norm_write(self, tmp_audio, idx0, idx1):
        """Normalize one slice and write both full-rate and 16 kHz wavs.

        idx0/idx1 identify source file and slice in the output file names.
        """
        tmp_max = np.abs(tmp_audio).max()
        # NOTE(review): slices peaking above 2.5 are treated as corrupted /
        # abnormally loud and dropped — threshold appears empirical.
        if tmp_max > 2.5:
            print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max))
            return
        # Blend peak-normalized signal with the original (alpha mix).
        tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
            1 - self.alpha
        ) * tmp_audio
        wavfile.write(
            "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
            self.sr,
            tmp_audio.astype(np.float32),
        )
        tmp_audio = librosa.resample(
            tmp_audio, orig_sr=self.sr, target_sr=16000
        )  # , res_type="soxr_vhq"
        wavfile.write(
            "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
            16000,
            tmp_audio.astype(np.float32),
        )

    def pipeline(self, path, idx0):
        """Load one file, high-pass it, slice on silence, window and write."""
        try:
            audio = load_audio(path, self.sr)
            # zero phased digital filter cause pre-ringing noise...
            # audio = signal.filtfilt(self.bh, self.ah, audio)
            audio = signal.lfilter(self.bh, self.ah, audio)

            idx1 = 0
            for audio in self.slicer.slice(audio):
                i = 0
                # Cut each silence-free chunk into `per`-second windows with
                # `overlap` seconds of overlap; the remainder becomes the tail.
                while 1:
                    start = int(self.sr * (self.per - self.overlap) * i)
                    i += 1
                    if len(audio[start:]) > self.tail * self.sr:
                        tmp_audio = audio[start : start + int(self.per * self.sr)]
                        self.norm_write(tmp_audio, idx0, idx1)
                        idx1 += 1
                    else:
                        tmp_audio = audio[start:]
                        idx1 += 1
                        break
                self.norm_write(tmp_audio, idx0, idx1)
            println("%s\t-> Success" % path)
        except:
            println("%s\t-> %s" % (path, traceback.format_exc()))

    def pipeline_mp(self, infos):
        """Run `pipeline` over a list of (path, idx0) pairs."""
        for path, idx0 in infos:
            self.pipeline(path, idx0)

    def pipeline_mp_inp_dir(self, inp_root, n_p):
        """Process every file in `inp_root`, optionally with n_p processes."""
        try:
            infos = [
                ("%s/%s" % (inp_root, name), idx)
                for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
            ]
            if noparallel:
                for i in range(n_p):
                    self.pipeline_mp(infos[i::n_p])
            else:
                ps = []
                for i in range(n_p):
                    p = multiprocessing.Process(
                        target=self.pipeline_mp, args=(infos[i::n_p],)
                    )
                    ps.append(p)
                    p.start()
                for i in range(n_p):
                    ps[i].join()
        except:
            println("Fail. %s" % traceback.format_exc())
def preprocess_trainset(inp_root, sr, n_p, exp_dir, per):
    """Preprocess a dataset directory into sliced/normalized training wavs.

    Args:
        inp_root: directory of raw input audio files.
        sr: target sample rate for the ground-truth wavs.
        n_p: number of worker processes.
        exp_dir: experiment directory receiving the output folders.
        per: segment length in seconds.
    """
    pp = PreProcess(sr, exp_dir, per)
    println("start preprocess")
    pp.pipeline_mp_inp_dir(inp_root, n_p)
    println("end preprocess")
if __name__ == "__main__":
    # All arguments come from sys.argv (parsed at module top).
    preprocess_trainset(inp_root, sr, n_p, exp_dir, per)
Prev
1
…
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment