Commit 5c023842 authored by chenpangpang's avatar chenpangpang
Browse files

feat: 增加LatentSync

parent 822b66ca
Pipeline #2211 canceled with stages
from dataclasses import dataclass
from typing import Dict
from typing import Iterable, Optional
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from torch import nn
from .transcribe import transcribe as transcribe_function
from .decoding import detect_language as detect_language_function, decode as decode_function
@dataclass
class ModelDimensions:
    """Hyperparameters describing the sizes of a Whisper model's audio encoder and text decoder."""

    n_mels: int  # number of mel-spectrogram bins in the encoder input
    n_audio_ctx: int  # number of audio positions (frames after conv downsampling)
    n_audio_state: int  # hidden width of the audio encoder
    n_audio_head: int  # attention heads per encoder block
    n_audio_layer: int  # number of encoder blocks
    n_vocab: int  # vocabulary size of the text decoder
    n_text_ctx: int  # maximum number of text token positions
    n_text_state: int  # hidden width of the text decoder
    n_text_head: int  # attention heads per decoder block
    n_text_layer: int  # number of decoder blocks
class LayerNorm(nn.LayerNorm):
    """LayerNorm that computes in float32 for stability, then restores the input dtype."""

    def forward(self, x: Tensor) -> Tensor:
        original_dtype = x.dtype
        normalized = super().forward(x.float())
        return normalized.type(original_dtype)
class Linear(nn.Linear):
    """Linear layer that casts its weight and bias to the input's dtype on every call."""

    def forward(self, x: Tensor) -> Tensor:
        weight = self.weight.to(x.dtype)
        bias = self.bias
        if bias is not None:
            bias = bias.to(x.dtype)
        return F.linear(x, weight, bias)
class Conv1d(nn.Conv1d):
    """Conv1d whose weight and bias follow the input's dtype (e.g. for fp16 inference)."""

    def _conv_forward(self, x: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor:
        cast_bias = None if bias is None else bias.to(x.dtype)
        return super()._conv_forward(x, weight.to(x.dtype), cast_bias)
def sinusoids(length, channels, max_timescale=10000):
    """Return a (length, channels) tensor of sinusoidal positional embeddings.

    The first channels//2 columns are sines and the rest are cosines, with
    geometrically spaced timescales from 1 up to max_timescale.
    """
    assert channels % 2 == 0
    half = channels // 2
    log_increment = np.log(max_timescale) / (half - 1)
    inv_timescales = torch.exp(torch.arange(half) * -log_increment)
    scaled_time = torch.arange(length).unsqueeze(1) * inv_timescales.unsqueeze(0)
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
class MultiHeadAttention(nn.Module):
    """Multi-head attention used for both self-attention and cross-attention.

    Supports an optional ``kv_cache`` dict mapping the key/value projection
    modules to previously computed tensors (see Whisper.install_kv_cache_hooks).
    """

    def __init__(self, n_state: int, n_head: int):
        super().__init__()
        self.n_head = n_head
        self.query = Linear(n_state, n_state)
        # key projection has no bias, matching the published Whisper checkpoints
        self.key = Linear(n_state, n_state, bias=False)
        self.value = Linear(n_state, n_state)
        self.out = Linear(n_state, n_state)

    def forward(
        self,
        x: Tensor,
        xa: Optional[Tensor] = None,  # cross-attention source; None means self-attention
        mask: Optional[Tensor] = None,
        kv_cache: Optional[dict] = None,
    ):
        q = self.query(x)

        if kv_cache is None or xa is None:
            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
            # otherwise, perform key/value projections for self- or cross-attention as usual.
            k = self.key(x if xa is None else xa)
            v = self.value(x if xa is None else xa)
        else:
            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
            # NOTE: dict.get evaluates its default eagerly, so self.key(xa)/self.value(xa) run
            # (and trigger any forward hooks) even when the cache already holds an entry.
            k = kv_cache.get(self.key, self.key(xa))
            v = kv_cache.get(self.value, self.value(xa))

        wv = self.qkv_attention(q, k, v, mask)
        return self.out(wv)

    def qkv_attention(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None):
        """Scaled dot-product attention over heads; returns (batch, ctx, n_state)."""
        n_batch, n_ctx, n_state = q.shape
        # scale is applied to both q and k, so the product carries the usual 1/sqrt(d) factor
        scale = (n_state // self.n_head) ** -0.25
        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
        v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)

        qk = q @ k
        if mask is not None:
            # additive causal mask, cropped to the current context length
            qk = qk + mask[:n_ctx, :n_ctx]

        # softmax in float32 for numerical stability, then cast back
        w = F.softmax(qk.float(), dim=-1).to(q.dtype)
        return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2)
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention, optional cross-attention, then an MLP."""

    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
        super().__init__()

        self.attn = MultiHeadAttention(n_state, n_head)
        self.attn_ln = LayerNorm(n_state)

        if cross_attention:
            self.cross_attn = MultiHeadAttention(n_state, n_head)
            self.cross_attn_ln = LayerNorm(n_state)
        else:
            self.cross_attn = None
            self.cross_attn_ln = None

        hidden = n_state * 4
        self.mlp = nn.Sequential(Linear(n_state, hidden), nn.GELU(), Linear(hidden, n_state))
        self.mlp_ln = LayerNorm(n_state)

    def forward(
        self,
        x: Tensor,
        xa: Optional[Tensor] = None,
        mask: Optional[Tensor] = None,
        kv_cache: Optional[dict] = None,
    ):
        # each sub-layer is applied to a normalized input and added back residually
        x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)
        if self.cross_attn is not None:
            x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)
        return x + self.mlp(self.mlp_ln(x))
class AudioEncoder(nn.Module):
    """Encode a log-mel spectrogram into audio features via two convolutions and transformer blocks."""

    def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__()
        self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
        # stride-2 conv halves the time dimension down to n_ctx
        self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
        self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))

        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
        )
        self.ln_post = LayerNorm(n_state)

    def forward(self, x: Tensor, include_embeddings: bool = False):
        """
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        include_embeddings: bool
            whether to include intermediate steps in the output
        """
        x = F.gelu(self.conv1(x))
        x = F.gelu(self.conv2(x))
        x = x.permute(0, 2, 1)  # -> (batch, n_ctx, n_state)

        assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
        x = (x + self.positional_embedding).to(x.dtype)

        # collect per-layer activations only when requested
        embeddings = [x.cpu().detach().numpy()] if include_embeddings else None

        for block in self.blocks:
            x = block(x)
            if embeddings is not None:
                embeddings.append(x.cpu().detach().numpy())

        x = self.ln_post(x)

        if embeddings is None:
            return x
        return x, np.stack(embeddings, axis=1)
class TextDecoder(nn.Module):
    """Autoregressive transformer decoder producing token logits, cross-attending to audio features."""

    def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__()

        self.token_embedding = nn.Embedding(n_vocab, n_state)
        # learned positional embedding (unlike the encoder's fixed sinusoids)
        self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))

        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [ResidualAttentionBlock(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
        )
        self.ln = LayerNorm(n_state)

        # causal mask: -inf above the diagonal so a position cannot attend ahead
        mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
        self.register_buffer("mask", mask, persistent=False)

    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None, include_embeddings: bool = False):
        """
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
            the encoded audio features to be attended on
        include_embeddings : bool
            Whether to include intermediate values in the output to this function
        """
        # with a kv cache, only the new tokens are passed in; offset positions by the cached length
        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
        x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]]
        x = x.to(xa.dtype)

        if include_embeddings:
            embeddings = [x.cpu().detach().numpy()]

        for block in self.blocks:
            x = block(x, xa, mask=self.mask, kv_cache=kv_cache)
            if include_embeddings:
                embeddings.append(x.cpu().detach().numpy())

        x = self.ln(x)
        # output projection is tied to the token embedding weights
        logits = (x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)).float()

        if include_embeddings:
            embeddings = np.stack(embeddings, axis=1)
            return logits, embeddings
        else:
            return logits
class Whisper(nn.Module):
    """Full Whisper model: AudioEncoder + TextDecoder, plus kv-cache hook installation."""

    def __init__(self, dims: ModelDimensions):
        super().__init__()
        self.dims = dims
        self.encoder = AudioEncoder(
            self.dims.n_mels,
            self.dims.n_audio_ctx,
            self.dims.n_audio_state,
            self.dims.n_audio_head,
            self.dims.n_audio_layer,
        )
        self.decoder = TextDecoder(
            self.dims.n_vocab,
            self.dims.n_text_ctx,
            self.dims.n_text_state,
            self.dims.n_text_head,
            self.dims.n_text_layer,
        )

    def embed_audio(self, mel: torch.Tensor):
        """Run the encoder on a mel spectrogram and return audio features."""
        return self.encoder.forward(mel)

    def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
        """Run the decoder on tokens attending to precomputed audio features."""
        return self.decoder.forward(tokens, audio_features)

    def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]:
        return self.decoder(tokens, self.encoder(mel))

    @property
    def device(self):
        # device of the first parameter; assumes the whole model lives on one device
        return next(self.parameters()).device

    @property
    def is_multilingual(self):
        # multilingual checkpoints are identified by their 51865-token vocabulary
        return self.dims.n_vocab == 51865

    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
        """
        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
        tensors calculated for the previous positions. This method returns a dictionary that stores
        all caches, and the necessary hooks for the key and value projection modules that save the
        intermediate tensors to be reused during later calculations.

        Returns
        -------
        cache : Dict[nn.Module, torch.Tensor]
            A dictionary object mapping the key/value projection modules to its cache
        hooks : List[RemovableHandle]
            List of PyTorch RemovableHandle objects to stop the hooks to be called
        """
        cache = {**cache} if cache is not None else {}
        hooks = []

        def save_to_cache(module, _, output):
            # cross-attention outputs span the audio context, which is longer than the
            # text context; those (and the very first call) are stored as-is rather than
            # concatenated onto a previous cache entry.
            if module not in cache or output.shape[1] > self.decoder.positional_embedding.shape[0]:
                cache[module] = output  # save as-is, for the first token or cross attention
            else:
                cache[module] = torch.cat([cache[module], output], dim=1).detach()
            return cache[module]

        def install_hooks(layer: nn.Module):
            # only decoder attention layers get caching hooks (applied via decoder.apply below)
            if isinstance(layer, MultiHeadAttention):
                hooks.append(layer.key.register_forward_hook(save_to_cache))
                hooks.append(layer.value.register_forward_hook(save_to_cache))

        self.decoder.apply(install_hooks)
        return cache, hooks

    detect_language = detect_language_function
    transcribe = transcribe_function
    decode = decode_function
from .basic import BasicTextNormalizer
from .english import EnglishTextNormalizer
import re
import unicodedata
import regex
# non-ASCII letters that are not separated by "NFKD" normalization
# (maps each such letter to its ASCII transliteration; consulted by
# remove_symbols_and_diacritics before the Unicode-category checks)
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "th",
    "ł": "l",
    "Ł": "L",
}
def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Replace any other markers, symbols, and punctuations with a space,
    and drop any diacritics (category 'Mn' and some manual mappings)
    """

    def translate(char):
        # characters explicitly kept by the caller pass through untouched
        if char in keep:
            return char
        # manual transliterations for letters NFKD cannot decompose
        if char in ADDITIONAL_DIACRITICS:
            return ADDITIONAL_DIACRITICS[char]
        category = unicodedata.category(char)
        if category == "Mn":  # combining marks (diacritics) are dropped
            return ""
        if category[0] in "MSP":  # other marks, symbols, punctuation become spaces
            return " "
        return char

    return "".join(translate(c) for c in unicodedata.normalize("NFKD", s))
def remove_symbols(s: str):
    """
    Replace any other markers, symbols, punctuations with a space, keeping diacritics
    """
    normalized = unicodedata.normalize("NFKC", s)
    # marks, symbols, and punctuation (major categories M, S, P) become spaces
    pieces = [" " if unicodedata.category(c)[0] in "MSP" else c for c in normalized]
    return "".join(pieces)
class BasicTextNormalizer:
    """Lowercase text, strip bracketed/parenthesized asides and symbols, collapse whitespace."""

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        # choose the cleaning function once, at construction time
        if remove_diacritics:
            self.clean = remove_symbols_and_diacritics
        else:
            self.clean = remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        s = s.lower()
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        s = self.clean(s).lower()

        if self.split_letters:
            # separate every extended grapheme cluster with a space
            s = " ".join(regex.findall(r"\X", s, regex.U))

        # replace any successive whitespace characters with a space
        return re.sub(r"\s+", " ", s)
{
"accessorise": "accessorize",
"accessorised": "accessorized",
"accessorises": "accessorizes",
"accessorising": "accessorizing",
"acclimatisation": "acclimatization",
"acclimatise": "acclimatize",
"acclimatised": "acclimatized",
"acclimatises": "acclimatizes",
"acclimatising": "acclimatizing",
"accoutrements": "accouterments",
"aeon": "eon",
"aeons": "eons",
"aerogramme": "aerogram",
"aerogrammes": "aerograms",
"aeroplane": "airplane",
"aeroplanes": "airplanes",
"aesthete": "esthete",
"aesthetes": "esthetes",
"aesthetic": "esthetic",
"aesthetically": "esthetically",
"aesthetics": "esthetics",
"aetiology": "etiology",
"ageing": "aging",
"aggrandisement": "aggrandizement",
"agonise": "agonize",
"agonised": "agonized",
"agonises": "agonizes",
"agonising": "agonizing",
"agonisingly": "agonizingly",
"almanack": "almanac",
"almanacks": "almanacs",
"aluminium": "aluminum",
"amortisable": "amortizable",
"amortisation": "amortization",
"amortisations": "amortizations",
"amortise": "amortize",
"amortised": "amortized",
"amortises": "amortizes",
"amortising": "amortizing",
"amphitheatre": "amphitheater",
"amphitheatres": "amphitheaters",
"anaemia": "anemia",
"anaemic": "anemic",
"anaesthesia": "anesthesia",
"anaesthetic": "anesthetic",
"anaesthetics": "anesthetics",
"anaesthetise": "anesthetize",
"anaesthetised": "anesthetized",
"anaesthetises": "anesthetizes",
"anaesthetising": "anesthetizing",
"anaesthetist": "anesthetist",
"anaesthetists": "anesthetists",
"anaesthetize": "anesthetize",
"anaesthetized": "anesthetized",
"anaesthetizes": "anesthetizes",
"anaesthetizing": "anesthetizing",
"analogue": "analog",
"analogues": "analogs",
"analyse": "analyze",
"analysed": "analyzed",
"analyses": "analyzes",
"analysing": "analyzing",
"anglicise": "anglicize",
"anglicised": "anglicized",
"anglicises": "anglicizes",
"anglicising": "anglicizing",
"annualised": "annualized",
"antagonise": "antagonize",
"antagonised": "antagonized",
"antagonises": "antagonizes",
"antagonising": "antagonizing",
"apologise": "apologize",
"apologised": "apologized",
"apologises": "apologizes",
"apologising": "apologizing",
"appal": "appall",
"appals": "appalls",
"appetiser": "appetizer",
"appetisers": "appetizers",
"appetising": "appetizing",
"appetisingly": "appetizingly",
"arbour": "arbor",
"arbours": "arbors",
"archeological": "archaeological",
"archaeologically": "archeologically",
"archaeologist": "archeologist",
"archaeologists": "archeologists",
"archaeology": "archeology",
"ardour": "ardor",
"armour": "armor",
"armoured": "armored",
"armourer": "armorer",
"armourers": "armorers",
"armouries": "armories",
"armoury": "armory",
"artefact": "artifact",
"artefacts": "artifacts",
"authorise": "authorize",
"authorised": "authorized",
"authorises": "authorizes",
"authorising": "authorizing",
"axe": "ax",
"backpedalled": "backpedaled",
"backpedalling": "backpedaling",
"bannister": "banister",
"bannisters": "banisters",
"baptise": "baptize",
"baptised": "baptized",
"baptises": "baptizes",
"baptising": "baptizing",
"bastardise": "bastardize",
"bastardised": "bastardized",
"bastardises": "bastardizes",
"bastardising": "bastardizing",
"battleax": "battleaxe",
"baulk": "balk",
"baulked": "balked",
"baulking": "balking",
"baulks": "balks",
"bedevilled": "bedeviled",
"bedevilling": "bedeviling",
"behaviour": "behavior",
"behavioural": "behavioral",
"behaviourism": "behaviorism",
"behaviourist": "behaviorist",
"behaviourists": "behaviorists",
"behaviours": "behaviors",
"behove": "behoove",
"behoved": "behooved",
"behoves": "behooves",
"bejewelled": "bejeweled",
"belabour": "belabor",
"belaboured": "belabored",
"belabouring": "belaboring",
"belabours": "belabors",
"bevelled": "beveled",
"bevvies": "bevies",
"bevvy": "bevy",
"biassed": "biased",
"biassing": "biasing",
"bingeing": "binging",
"bougainvillaea": "bougainvillea",
"bougainvillaeas": "bougainvilleas",
"bowdlerise": "bowdlerize",
"bowdlerised": "bowdlerized",
"bowdlerises": "bowdlerizes",
"bowdlerising": "bowdlerizing",
"breathalyse": "breathalyze",
"breathalysed": "breathalyzed",
"breathalyser": "breathalyzer",
"breathalysers": "breathalyzers",
"breathalyses": "breathalyzes",
"breathalysing": "breathalyzing",
"brutalise": "brutalize",
"brutalised": "brutalized",
"brutalises": "brutalizes",
"brutalising": "brutalizing",
"busses": "buses",
"bussing": "busing",
"caesarean": "cesarean",
"caesareans": "cesareans",
"calibre": "caliber",
"calibres": "calibers",
"calliper": "caliper",
"callipers": "calipers",
"callisthenics": "calisthenics",
"canalise": "canalize",
"canalised": "canalized",
"canalises": "canalizes",
"canalising": "canalizing",
"cancelation": "cancellation",
"cancelations": "cancellations",
"cancelled": "canceled",
"cancelling": "canceling",
"candour": "candor",
"cannibalise": "cannibalize",
"cannibalised": "cannibalized",
"cannibalises": "cannibalizes",
"cannibalising": "cannibalizing",
"canonise": "canonize",
"canonised": "canonized",
"canonises": "canonizes",
"canonising": "canonizing",
"capitalise": "capitalize",
"capitalised": "capitalized",
"capitalises": "capitalizes",
"capitalising": "capitalizing",
"caramelise": "caramelize",
"caramelised": "caramelized",
"caramelises": "caramelizes",
"caramelising": "caramelizing",
"carbonise": "carbonize",
"carbonised": "carbonized",
"carbonises": "carbonizes",
"carbonising": "carbonizing",
"carolled": "caroled",
"carolling": "caroling",
"catalogue": "catalog",
"catalogued": "cataloged",
"catalogues": "catalogs",
"cataloguing": "cataloging",
"catalyse": "catalyze",
"catalysed": "catalyzed",
"catalyses": "catalyzes",
"catalysing": "catalyzing",
"categorise": "categorize",
"categorised": "categorized",
"categorises": "categorizes",
"categorising": "categorizing",
"cauterise": "cauterize",
"cauterised": "cauterized",
"cauterises": "cauterizes",
"cauterising": "cauterizing",
"cavilled": "caviled",
"cavilling": "caviling",
"centigramme": "centigram",
"centigrammes": "centigrams",
"centilitre": "centiliter",
"centilitres": "centiliters",
"centimetre": "centimeter",
"centimetres": "centimeters",
"centralise": "centralize",
"centralised": "centralized",
"centralises": "centralizes",
"centralising": "centralizing",
"centre": "center",
"centred": "centered",
"centrefold": "centerfold",
"centrefolds": "centerfolds",
"centrepiece": "centerpiece",
"centrepieces": "centerpieces",
"centres": "centers",
"channelled": "channeled",
"channelling": "channeling",
"characterise": "characterize",
"characterised": "characterized",
"characterises": "characterizes",
"characterising": "characterizing",
"cheque": "check",
"chequebook": "checkbook",
"chequebooks": "checkbooks",
"chequered": "checkered",
"cheques": "checks",
"chilli": "chili",
"chimaera": "chimera",
"chimaeras": "chimeras",
"chiselled": "chiseled",
"chiselling": "chiseling",
"circularise": "circularize",
"circularised": "circularized",
"circularises": "circularizes",
"circularising": "circularizing",
"civilise": "civilize",
"civilised": "civilized",
"civilises": "civilizes",
"civilising": "civilizing",
"clamour": "clamor",
"clamoured": "clamored",
"clamouring": "clamoring",
"clamours": "clamors",
"clangour": "clangor",
"clarinettist": "clarinetist",
"clarinettists": "clarinetists",
"collectivise": "collectivize",
"collectivised": "collectivized",
"collectivises": "collectivizes",
"collectivising": "collectivizing",
"colonisation": "colonization",
"colonise": "colonize",
"colonised": "colonized",
"coloniser": "colonizer",
"colonisers": "colonizers",
"colonises": "colonizes",
"colonising": "colonizing",
"colour": "color",
"colourant": "colorant",
"colourants": "colorants",
"coloured": "colored",
"coloureds": "coloreds",
"colourful": "colorful",
"colourfully": "colorfully",
"colouring": "coloring",
"colourize": "colorize",
"colourized": "colorized",
"colourizes": "colorizes",
"colourizing": "colorizing",
"colourless": "colorless",
"colours": "colors",
"commercialise": "commercialize",
"commercialised": "commercialized",
"commercialises": "commercializes",
"commercialising": "commercializing",
"compartmentalise": "compartmentalize",
"compartmentalised": "compartmentalized",
"compartmentalises": "compartmentalizes",
"compartmentalising": "compartmentalizing",
"computerise": "computerize",
"computerised": "computerized",
"computerises": "computerizes",
"computerising": "computerizing",
"conceptualise": "conceptualize",
"conceptualised": "conceptualized",
"conceptualises": "conceptualizes",
"conceptualising": "conceptualizing",
"connexion": "connection",
"connexions": "connections",
"contextualise": "contextualize",
"contextualised": "contextualized",
"contextualises": "contextualizes",
"contextualising": "contextualizing",
"cosier": "cozier",
"cosies": "cozies",
"cosiest": "coziest",
"cosily": "cozily",
"cosiness": "coziness",
"cosy": "cozy",
"councillor": "councilor",
"councillors": "councilors",
"counselled": "counseled",
"counselling": "counseling",
"counsellor": "counselor",
"counsellors": "counselors",
"crenelated": "crenellated",
"criminalise": "criminalize",
"criminalised": "criminalized",
"criminalises": "criminalizes",
"criminalising": "criminalizing",
"criticise": "criticize",
"criticised": "criticized",
"criticises": "criticizes",
"criticising": "criticizing",
"crueller": "crueler",
"cruellest": "cruelest",
"crystallisation": "crystallization",
"crystallise": "crystallize",
"crystallised": "crystallized",
"crystallises": "crystallizes",
"crystallising": "crystallizing",
"cudgelled": "cudgeled",
"cudgelling": "cudgeling",
"customise": "customize",
"customised": "customized",
"customises": "customizes",
"customising": "customizing",
"cypher": "cipher",
"cyphers": "ciphers",
"decentralisation": "decentralization",
"decentralise": "decentralize",
"decentralised": "decentralized",
"decentralises": "decentralizes",
"decentralising": "decentralizing",
"decriminalisation": "decriminalization",
"decriminalise": "decriminalize",
"decriminalised": "decriminalized",
"decriminalises": "decriminalizes",
"decriminalising": "decriminalizing",
"defence": "defense",
"defenceless": "defenseless",
"defences": "defenses",
"dehumanisation": "dehumanization",
"dehumanise": "dehumanize",
"dehumanised": "dehumanized",
"dehumanises": "dehumanizes",
"dehumanising": "dehumanizing",
"demeanour": "demeanor",
"demilitarisation": "demilitarization",
"demilitarise": "demilitarize",
"demilitarised": "demilitarized",
"demilitarises": "demilitarizes",
"demilitarising": "demilitarizing",
"demobilisation": "demobilization",
"demobilise": "demobilize",
"demobilised": "demobilized",
"demobilises": "demobilizes",
"demobilising": "demobilizing",
"democratisation": "democratization",
"democratise": "democratize",
"democratised": "democratized",
"democratises": "democratizes",
"democratising": "democratizing",
"demonise": "demonize",
"demonised": "demonized",
"demonises": "demonizes",
"demonising": "demonizing",
"demoralisation": "demoralization",
"demoralise": "demoralize",
"demoralised": "demoralized",
"demoralises": "demoralizes",
"demoralising": "demoralizing",
"denationalisation": "denationalization",
"denationalise": "denationalize",
"denationalised": "denationalized",
"denationalises": "denationalizes",
"denationalising": "denationalizing",
"deodorise": "deodorize",
"deodorised": "deodorized",
"deodorises": "deodorizes",
"deodorising": "deodorizing",
"depersonalise": "depersonalize",
"depersonalised": "depersonalized",
"depersonalises": "depersonalizes",
"depersonalising": "depersonalizing",
"deputise": "deputize",
"deputised": "deputized",
"deputises": "deputizes",
"deputising": "deputizing",
"desensitisation": "desensitization",
"desensitise": "desensitize",
"desensitised": "desensitized",
"desensitises": "desensitizes",
"desensitising": "desensitizing",
"destabilisation": "destabilization",
"destabilise": "destabilize",
"destabilised": "destabilized",
"destabilises": "destabilizes",
"destabilising": "destabilizing",
"dialled": "dialed",
"dialling": "dialing",
"dialogue": "dialog",
"dialogues": "dialogs",
"diarrhoea": "diarrhea",
"digitise": "digitize",
"digitised": "digitized",
"digitises": "digitizes",
"digitising": "digitizing",
"disc": "disk",
"discolour": "discolor",
"discoloured": "discolored",
"discolouring": "discoloring",
"discolours": "discolors",
"discs": "disks",
"disembowelled": "disemboweled",
"disembowelling": "disemboweling",
"disfavour": "disfavor",
"dishevelled": "disheveled",
"dishonour": "dishonor",
"dishonourable": "dishonorable",
"dishonourably": "dishonorably",
"dishonoured": "dishonored",
"dishonouring": "dishonoring",
"dishonours": "dishonors",
"disorganisation": "disorganization",
"disorganised": "disorganized",
"distil": "distill",
"distils": "distills",
"dramatisation": "dramatization",
"dramatisations": "dramatizations",
"dramatise": "dramatize",
"dramatised": "dramatized",
"dramatises": "dramatizes",
"dramatising": "dramatizing",
"draught": "draft",
"draughtboard": "draftboard",
"draughtboards": "draftboards",
"draughtier": "draftier",
"draughtiest": "draftiest",
"draughts": "drafts",
"draughtsman": "draftsman",
"draughtsmanship": "draftsmanship",
"draughtsmen": "draftsmen",
"draughtswoman": "draftswoman",
"draughtswomen": "draftswomen",
"draughty": "drafty",
"drivelled": "driveled",
"drivelling": "driveling",
"duelled": "dueled",
"duelling": "dueling",
"economise": "economize",
"economised": "economized",
"economises": "economizes",
"economising": "economizing",
"edoema": "edema",
"editorialise": "editorialize",
"editorialised": "editorialized",
"editorialises": "editorializes",
"editorialising": "editorializing",
"empathise": "empathize",
"empathised": "empathized",
"empathises": "empathizes",
"empathising": "empathizing",
"emphasise": "emphasize",
"emphasised": "emphasized",
"emphasises": "emphasizes",
"emphasising": "emphasizing",
"enamelled": "enameled",
"enamelling": "enameling",
"enamoured": "enamored",
"encyclopaedia": "encyclopedia",
"encyclopaedias": "encyclopedias",
"encyclopaedic": "encyclopedic",
"endeavour": "endeavor",
"endeavoured": "endeavored",
"endeavouring": "endeavoring",
"endeavours": "endeavors",
"energise": "energize",
"energised": "energized",
"energises": "energizes",
"energising": "energizing",
"enrol": "enroll",
"enrols": "enrolls",
"enthral": "enthrall",
"enthrals": "enthralls",
"epaulette": "epaulet",
"epaulettes": "epaulets",
"epicentre": "epicenter",
"epicentres": "epicenters",
"epilogue": "epilog",
"epilogues": "epilogs",
"epitomise": "epitomize",
"epitomised": "epitomized",
"epitomises": "epitomizes",
"epitomising": "epitomizing",
"equalisation": "equalization",
"equalise": "equalize",
"equalised": "equalized",
"equaliser": "equalizer",
"equalisers": "equalizers",
"equalises": "equalizes",
"equalising": "equalizing",
"eulogise": "eulogize",
"eulogised": "eulogized",
"eulogises": "eulogizes",
"eulogising": "eulogizing",
"evangelise": "evangelize",
"evangelised": "evangelized",
"evangelises": "evangelizes",
"evangelising": "evangelizing",
"exorcise": "exorcize",
"exorcised": "exorcized",
"exorcises": "exorcizes",
"exorcising": "exorcizing",
"extemporisation": "extemporization",
"extemporise": "extemporize",
"extemporised": "extemporized",
"extemporises": "extemporizes",
"extemporising": "extemporizing",
"externalisation": "externalization",
"externalisations": "externalizations",
"externalise": "externalize",
"externalised": "externalized",
"externalises": "externalizes",
"externalising": "externalizing",
"factorise": "factorize",
"factorised": "factorized",
"factorises": "factorizes",
"factorising": "factorizing",
"faecal": "fecal",
"faeces": "feces",
"familiarisation": "familiarization",
"familiarise": "familiarize",
"familiarised": "familiarized",
"familiarises": "familiarizes",
"familiarising": "familiarizing",
"fantasise": "fantasize",
"fantasised": "fantasized",
"fantasises": "fantasizes",
"fantasising": "fantasizing",
"favour": "favor",
"favourable": "favorable",
"favourably": "favorably",
"favoured": "favored",
"favouring": "favoring",
"favourite": "favorite",
"favourites": "favorites",
"favouritism": "favoritism",
"favours": "favors",
"feminise": "feminize",
"feminised": "feminized",
"feminises": "feminizes",
"feminising": "feminizing",
"fertilisation": "fertilization",
"fertilise": "fertilize",
"fertilised": "fertilized",
"fertiliser": "fertilizer",
"fertilisers": "fertilizers",
"fertilises": "fertilizes",
"fertilising": "fertilizing",
"fervour": "fervor",
"fibre": "fiber",
"fibreglass": "fiberglass",
"fibres": "fibers",
"fictionalisation": "fictionalization",
"fictionalisations": "fictionalizations",
"fictionalise": "fictionalize",
"fictionalised": "fictionalized",
"fictionalises": "fictionalizes",
"fictionalising": "fictionalizing",
"fillet": "filet",
"filleted": "fileted",
"filleting": "fileting",
"fillets": "filets",
"finalisation": "finalization",
"finalise": "finalize",
"finalised": "finalized",
"finalises": "finalizes",
"finalising": "finalizing",
"flautist": "flutist",
"flautists": "flutists",
"flavour": "flavor",
"flavoured": "flavored",
"flavouring": "flavoring",
"flavourings": "flavorings",
"flavourless": "flavorless",
"flavours": "flavors",
"flavoursome": "flavorsome",
"flyer / flier": "flier / flyer",
"foetal": "fetal",
"foetid": "fetid",
"foetus": "fetus",
"foetuses": "fetuses",
"formalisation": "formalization",
"formalise": "formalize",
"formalised": "formalized",
"formalises": "formalizes",
"formalising": "formalizing",
"fossilisation": "fossilization",
"fossilise": "fossilize",
"fossilised": "fossilized",
"fossilises": "fossilizes",
"fossilising": "fossilizing",
"fraternisation": "fraternization",
"fraternise": "fraternize",
"fraternised": "fraternized",
"fraternises": "fraternizes",
"fraternising": "fraternizing",
"fulfil": "fulfill",
"fulfilment": "fulfillment",
"fulfils": "fulfills",
"funnelled": "funneled",
"funnelling": "funneling",
"galvanise": "galvanize",
"galvanised": "galvanized",
"galvanises": "galvanizes",
"galvanising": "galvanizing",
"gambolled": "gamboled",
"gambolling": "gamboling",
"gaol": "jail",
"gaolbird": "jailbird",
"gaolbirds": "jailbirds",
"gaolbreak": "jailbreak",
"gaolbreaks": "jailbreaks",
"gaoled": "jailed",
"gaoler": "jailer",
"gaolers": "jailers",
"gaoling": "jailing",
"gaols": "jails",
"gasses": "gases",
"gage": "gauge",
"gaged": "gauged",
"gages": "gauges",
"gaging": "gauging",
"generalisation": "generalization",
"generalisations": "generalizations",
"generalise": "generalize",
"generalised": "generalized",
"generalises": "generalizes",
"generalising": "generalizing",
"ghettoise": "ghettoize",
"ghettoised": "ghettoized",
"ghettoises": "ghettoizes",
"ghettoising": "ghettoizing",
"gipsies": "gypsies",
"glamorise": "glamorize",
"glamorised": "glamorized",
"glamorises": "glamorizes",
"glamorising": "glamorizing",
"glamor": "glamour",
"globalisation": "globalization",
"globalise": "globalize",
"globalised": "globalized",
"globalises": "globalizes",
"globalising": "globalizing",
"glueing": "gluing",
"goitre": "goiter",
"goitres": "goiters",
"gonorrhoea": "gonorrhea",
"gramme": "gram",
"grammes": "grams",
"gravelled": "graveled",
"grey": "gray",
"greyed": "grayed",
"greying": "graying",
"greyish": "grayish",
"greyness": "grayness",
"greys": "grays",
"grovelled": "groveled",
"grovelling": "groveling",
"groyne": "groin",
"groynes": "groins",
"gruelling": "grueling",
"gruellingly": "gruelingly",
"gryphon": "griffin",
"gryphons": "griffins",
"gynaecological": "gynecological",
"gynaecologist": "gynecologist",
"gynaecologists": "gynecologists",
"gynaecology": "gynecology",
"haematological": "hematological",
"haematologist": "hematologist",
"haematologists": "hematologists",
"haematology": "hematology",
"haemoglobin": "hemoglobin",
"haemophilia": "hemophilia",
"haemophiliac": "hemophiliac",
"haemophiliacs": "hemophiliacs",
"haemorrhage": "hemorrhage",
"haemorrhaged": "hemorrhaged",
"haemorrhages": "hemorrhages",
"haemorrhaging": "hemorrhaging",
"haemorrhoids": "hemorrhoids",
"harbour": "harbor",
"harboured": "harbored",
"harbouring": "harboring",
"harbours": "harbors",
"harmonisation": "harmonization",
"harmonise": "harmonize",
"harmonised": "harmonized",
"harmonises": "harmonizes",
"harmonising": "harmonizing",
"homoeopath": "homeopath",
"homoeopathic": "homeopathic",
"homoeopaths": "homeopaths",
"homoeopathy": "homeopathy",
"homogenise": "homogenize",
"homogenised": "homogenized",
"homogenises": "homogenizes",
"homogenising": "homogenizing",
"honour": "honor",
"honourable": "honorable",
"honourably": "honorably",
"honoured": "honored",
"honouring": "honoring",
"honours": "honors",
"hospitalisation": "hospitalization",
"hospitalise": "hospitalize",
"hospitalised": "hospitalized",
"hospitalises": "hospitalizes",
"hospitalising": "hospitalizing",
"humanise": "humanize",
"humanised": "humanized",
"humanises": "humanizes",
"humanising": "humanizing",
"humour": "humor",
"humoured": "humored",
"humouring": "humoring",
"humourless": "humorless",
"humours": "humors",
"hybridise": "hybridize",
"hybridised": "hybridized",
"hybridises": "hybridizes",
"hybridising": "hybridizing",
"hypnotise": "hypnotize",
"hypnotised": "hypnotized",
"hypnotises": "hypnotizes",
"hypnotising": "hypnotizing",
"hypothesise": "hypothesize",
"hypothesised": "hypothesized",
"hypothesises": "hypothesizes",
"hypothesising": "hypothesizing",
"idealisation": "idealization",
"idealise": "idealize",
"idealised": "idealized",
"idealises": "idealizes",
"idealising": "idealizing",
"idolise": "idolize",
"idolised": "idolized",
"idolises": "idolizes",
"idolising": "idolizing",
"immobilisation": "immobilization",
"immobilise": "immobilize",
"immobilised": "immobilized",
"immobiliser": "immobilizer",
"immobilisers": "immobilizers",
"immobilises": "immobilizes",
"immobilising": "immobilizing",
"immortalise": "immortalize",
"immortalised": "immortalized",
"immortalises": "immortalizes",
"immortalising": "immortalizing",
"immunisation": "immunization",
"immunise": "immunize",
"immunised": "immunized",
"immunises": "immunizes",
"immunising": "immunizing",
"impanelled": "impaneled",
"impanelling": "impaneling",
"imperilled": "imperiled",
"imperilling": "imperiling",
"individualise": "individualize",
"individualised": "individualized",
"individualises": "individualizes",
"individualising": "individualizing",
"industrialise": "industrialize",
"industrialised": "industrialized",
"industrialises": "industrializes",
"industrialising": "industrializing",
"inflexion": "inflection",
"inflexions": "inflections",
"initialise": "initialize",
"initialised": "initialized",
"initialises": "initializes",
"initialising": "initializing",
"initialled": "initialed",
"initialling": "initialing",
"instal": "install",
"instalment": "installment",
"instalments": "installments",
"instals": "installs",
"instil": "instill",
"instils": "instills",
"institutionalisation": "institutionalization",
"institutionalise": "institutionalize",
"institutionalised": "institutionalized",
"institutionalises": "institutionalizes",
"institutionalising": "institutionalizing",
"intellectualise": "intellectualize",
"intellectualised": "intellectualized",
"intellectualises": "intellectualizes",
"intellectualising": "intellectualizing",
"internalisation": "internalization",
"internalise": "internalize",
"internalised": "internalized",
"internalises": "internalizes",
"internalising": "internalizing",
"internationalisation": "internationalization",
"internationalise": "internationalize",
"internationalised": "internationalized",
"internationalises": "internationalizes",
"internationalising": "internationalizing",
"ionisation": "ionization",
"ionise": "ionize",
"ionised": "ionized",
"ioniser": "ionizer",
"ionisers": "ionizers",
"ionises": "ionizes",
"ionising": "ionizing",
"italicise": "italicize",
"italicised": "italicized",
"italicises": "italicizes",
"italicising": "italicizing",
"itemise": "itemize",
"itemised": "itemized",
"itemises": "itemizes",
"itemising": "itemizing",
"jeopardise": "jeopardize",
"jeopardised": "jeopardized",
"jeopardises": "jeopardizes",
"jeopardising": "jeopardizing",
"jewelled": "jeweled",
"jeweller": "jeweler",
"jewellers": "jewelers",
"jewellery": "jewelry",
"judgement": "judgment",
"kilogramme": "kilogram",
"kilogrammes": "kilograms",
"kilometre": "kilometer",
"kilometres": "kilometers",
"labelled": "labeled",
"labelling": "labeling",
"labour": "labor",
"laboured": "labored",
"labourer": "laborer",
"labourers": "laborers",
"labouring": "laboring",
"labours": "labors",
"lacklustre": "lackluster",
"legalisation": "legalization",
"legalise": "legalize",
"legalised": "legalized",
"legalises": "legalizes",
"legalising": "legalizing",
"legitimise": "legitimize",
"legitimised": "legitimized",
"legitimises": "legitimizes",
"legitimising": "legitimizing",
"leukaemia": "leukemia",
"levelled": "leveled",
"leveller": "leveler",
"levellers": "levelers",
"levelling": "leveling",
"libelled": "libeled",
"libelling": "libeling",
"libellous": "libelous",
"liberalisation": "liberalization",
"liberalise": "liberalize",
"liberalised": "liberalized",
"liberalises": "liberalizes",
"liberalising": "liberalizing",
"licence": "license",
"licenced": "licensed",
"licences": "licenses",
"licencing": "licensing",
"likeable": "likable",
"lionisation": "lionization",
"lionise": "lionize",
"lionised": "lionized",
"lionises": "lionizes",
"lionising": "lionizing",
"liquidise": "liquidize",
"liquidised": "liquidized",
"liquidiser": "liquidizer",
"liquidisers": "liquidizers",
"liquidises": "liquidizes",
"liquidising": "liquidizing",
"litre": "liter",
"litres": "liters",
"localise": "localize",
"localised": "localized",
"localises": "localizes",
"localising": "localizing",
"louvre": "louver",
"louvred": "louvered",
"louvres": "louvers",
"lustre": "luster",
"magnetise": "magnetize",
"magnetised": "magnetized",
"magnetises": "magnetizes",
"magnetising": "magnetizing",
"manoeuvrability": "maneuverability",
"manoeuvrable": "maneuverable",
"manoeuvre": "maneuver",
"manoeuvred": "maneuvered",
"manoeuvres": "maneuvers",
"manoeuvring": "maneuvering",
"manoeuvrings": "maneuverings",
"marginalisation": "marginalization",
"marginalise": "marginalize",
"marginalised": "marginalized",
"marginalises": "marginalizes",
"marginalising": "marginalizing",
"marshalled": "marshaled",
"marshalling": "marshaling",
"marvelled": "marveled",
"marvelling": "marveling",
"marvellous": "marvelous",
"marvellously": "marvelously",
"materialisation": "materialization",
"materialise": "materialize",
"materialised": "materialized",
"materialises": "materializes",
"materialising": "materializing",
"maximisation": "maximization",
"maximise": "maximize",
"maximised": "maximized",
"maximises": "maximizes",
"maximising": "maximizing",
"meagre": "meager",
"mechanisation": "mechanization",
"mechanise": "mechanize",
"mechanised": "mechanized",
"mechanises": "mechanizes",
"mechanising": "mechanizing",
"mediaeval": "medieval",
"memorialise": "memorialize",
"memorialised": "memorialized",
"memorialises": "memorializes",
"memorialising": "memorializing",
"memorise": "memorize",
"memorised": "memorized",
"memorises": "memorizes",
"memorising": "memorizing",
"mesmerise": "mesmerize",
"mesmerised": "mesmerized",
"mesmerises": "mesmerizes",
"mesmerising": "mesmerizing",
"metabolise": "metabolize",
"metabolised": "metabolized",
"metabolises": "metabolizes",
"metabolising": "metabolizing",
"metre": "meter",
"metres": "meters",
"micrometre": "micrometer",
"micrometres": "micrometers",
"militarise": "militarize",
"militarised": "militarized",
"militarises": "militarizes",
"militarising": "militarizing",
"milligramme": "milligram",
"milligrammes": "milligrams",
"millilitre": "milliliter",
"millilitres": "milliliters",
"millimetre": "millimeter",
"millimetres": "millimeters",
"miniaturisation": "miniaturization",
"miniaturise": "miniaturize",
"miniaturised": "miniaturized",
"miniaturises": "miniaturizes",
"miniaturising": "miniaturizing",
"minibusses": "minibuses",
"minimise": "minimize",
"minimised": "minimized",
"minimises": "minimizes",
"minimising": "minimizing",
"misbehaviour": "misbehavior",
"misdemeanour": "misdemeanor",
"misdemeanours": "misdemeanors",
"misspelt": "misspelled",
"mitre": "miter",
"mitres": "miters",
"mobilisation": "mobilization",
"mobilise": "mobilize",
"mobilised": "mobilized",
"mobilises": "mobilizes",
"mobilising": "mobilizing",
"modelled": "modeled",
"modeller": "modeler",
"modellers": "modelers",
"modelling": "modeling",
"modernise": "modernize",
"modernised": "modernized",
"modernises": "modernizes",
"modernising": "modernizing",
"moisturise": "moisturize",
"moisturised": "moisturized",
"moisturiser": "moisturizer",
"moisturisers": "moisturizers",
"moisturises": "moisturizes",
"moisturising": "moisturizing",
"monologue": "monolog",
"monologues": "monologs",
"monopolisation": "monopolization",
"monopolise": "monopolize",
"monopolised": "monopolized",
"monopolises": "monopolizes",
"monopolising": "monopolizing",
"moralise": "moralize",
"moralised": "moralized",
"moralises": "moralizes",
"moralising": "moralizing",
"motorised": "motorized",
"mould": "mold",
"moulded": "molded",
"moulder": "molder",
"mouldered": "moldered",
"mouldering": "moldering",
"moulders": "molders",
"mouldier": "moldier",
"mouldiest": "moldiest",
"moulding": "molding",
"mouldings": "moldings",
"moulds": "molds",
"mouldy": "moldy",
"moult": "molt",
"moulted": "molted",
"moulting": "molting",
"moults": "molts",
"moustache": "mustache",
"moustached": "mustached",
"moustaches": "mustaches",
"moustachioed": "mustachioed",
"multicoloured": "multicolored",
"nationalisation": "nationalization",
"nationalisations": "nationalizations",
"nationalise": "nationalize",
"nationalised": "nationalized",
"nationalises": "nationalizes",
"nationalising": "nationalizing",
"naturalisation": "naturalization",
"naturalise": "naturalize",
"naturalised": "naturalized",
"naturalises": "naturalizes",
"naturalising": "naturalizing",
"neighbour": "neighbor",
"neighbourhood": "neighborhood",
"neighbourhoods": "neighborhoods",
"neighbouring": "neighboring",
"neighbourliness": "neighborliness",
"neighbourly": "neighborly",
"neighbours": "neighbors",
"neutralisation": "neutralization",
"neutralise": "neutralize",
"neutralised": "neutralized",
"neutralises": "neutralizes",
"neutralising": "neutralizing",
"normalisation": "normalization",
"normalise": "normalize",
"normalised": "normalized",
"normalises": "normalizes",
"normalising": "normalizing",
"odour": "odor",
"odourless": "odorless",
"odours": "odors",
"oesophagus": "esophagus",
"oesophaguses": "esophaguses",
"oestrogen": "estrogen",
"offence": "offense",
"offences": "offenses",
"omelette": "omelet",
"omelettes": "omelets",
"optimise": "optimize",
"optimised": "optimized",
"optimises": "optimizes",
"optimising": "optimizing",
"organisation": "organization",
"organisational": "organizational",
"organisations": "organizations",
"organise": "organize",
"organised": "organized",
"organiser": "organizer",
"organisers": "organizers",
"organises": "organizes",
"organising": "organizing",
"orthopaedic": "orthopedic",
"orthopaedics": "orthopedics",
"ostracise": "ostracize",
"ostracised": "ostracized",
"ostracises": "ostracizes",
"ostracising": "ostracizing",
"outmanoeuvre": "outmaneuver",
"outmanoeuvred": "outmaneuvered",
"outmanoeuvres": "outmaneuvers",
"outmanoeuvring": "outmaneuvering",
"overemphasise": "overemphasize",
"overemphasised": "overemphasized",
"overemphasises": "overemphasizes",
"overemphasising": "overemphasizing",
"oxidisation": "oxidization",
"oxidise": "oxidize",
"oxidised": "oxidized",
"oxidises": "oxidizes",
"oxidising": "oxidizing",
"paederast": "pederast",
"paederasts": "pederasts",
"paediatric": "pediatric",
"paediatrician": "pediatrician",
"paediatricians": "pediatricians",
"paediatrics": "pediatrics",
"paedophile": "pedophile",
"paedophiles": "pedophiles",
"paedophilia": "pedophilia",
"palaeolithic": "paleolithic",
"palaeontologist": "paleontologist",
"palaeontologists": "paleontologists",
"palaeontology": "paleontology",
"panelled": "paneled",
"panelling": "paneling",
"panellist": "panelist",
"panellists": "panelists",
"paralyse": "paralyze",
"paralysed": "paralyzed",
"paralyses": "paralyzes",
"paralysing": "paralyzing",
"parcelled": "parceled",
"parcelling": "parceling",
"parlour": "parlor",
"parlours": "parlors",
"particularise": "particularize",
"particularised": "particularized",
"particularises": "particularizes",
"particularising": "particularizing",
"passivisation": "passivization",
"passivise": "passivize",
"passivised": "passivized",
"passivises": "passivizes",
"passivising": "passivizing",
"pasteurisation": "pasteurization",
"pasteurise": "pasteurize",
"pasteurised": "pasteurized",
"pasteurises": "pasteurizes",
"pasteurising": "pasteurizing",
"patronise": "patronize",
"patronised": "patronized",
"patronises": "patronizes",
"patronising": "patronizing",
"patronisingly": "patronizingly",
"pedalled": "pedaled",
"pedalling": "pedaling",
"pedestrianisation": "pedestrianization",
"pedestrianise": "pedestrianize",
"pedestrianised": "pedestrianized",
"pedestrianises": "pedestrianizes",
"pedestrianising": "pedestrianizing",
"penalise": "penalize",
"penalised": "penalized",
"penalises": "penalizes",
"penalising": "penalizing",
"pencilled": "penciled",
"pencilling": "penciling",
"personalise": "personalize",
"personalised": "personalized",
"personalises": "personalizes",
"personalising": "personalizing",
"pharmacopoeia": "pharmacopeia",
"pharmacopoeias": "pharmacopeias",
"philosophise": "philosophize",
"philosophised": "philosophized",
"philosophises": "philosophizes",
"philosophising": "philosophizing",
"philtre": "filter",
"philtres": "filters",
"phoney": "phony",
"plagiarise": "plagiarize",
"plagiarised": "plagiarized",
"plagiarises": "plagiarizes",
"plagiarising": "plagiarizing",
"plough": "plow",
"ploughed": "plowed",
"ploughing": "plowing",
"ploughman": "plowman",
"ploughmen": "plowmen",
"ploughs": "plows",
"ploughshare": "plowshare",
"ploughshares": "plowshares",
"polarisation": "polarization",
"polarise": "polarize",
"polarised": "polarized",
"polarises": "polarizes",
"polarising": "polarizing",
"politicisation": "politicization",
"politicise": "politicize",
"politicised": "politicized",
"politicises": "politicizes",
"politicising": "politicizing",
"popularisation": "popularization",
"popularise": "popularize",
"popularised": "popularized",
"popularises": "popularizes",
"popularising": "popularizing",
"pouffe": "pouf",
"pouffes": "poufs",
"practise": "practice",
"practised": "practiced",
"practises": "practices",
"practising": "practicing",
"praesidium": "presidium",
"praesidiums": "presidiums",
"pressurisation": "pressurization",
"pressurise": "pressurize",
"pressurised": "pressurized",
"pressurises": "pressurizes",
"pressurising": "pressurizing",
"pretence": "pretense",
"pretences": "pretenses",
"primaeval": "primeval",
"prioritisation": "prioritization",
"prioritise": "prioritize",
"prioritised": "prioritized",
"prioritises": "prioritizes",
"prioritising": "prioritizing",
"privatisation": "privatization",
"privatisations": "privatizations",
"privatise": "privatize",
"privatised": "privatized",
"privatises": "privatizes",
"privatising": "privatizing",
"professionalisation": "professionalization",
"professionalise": "professionalize",
"professionalised": "professionalized",
"professionalises": "professionalizes",
"professionalising": "professionalizing",
"programme": "program",
"programmes": "programs",
"prologue": "prolog",
"prologues": "prologs",
"propagandise": "propagandize",
"propagandised": "propagandized",
"propagandises": "propagandizes",
"propagandising": "propagandizing",
"proselytise": "proselytize",
"proselytised": "proselytized",
"proselytiser": "proselytizer",
"proselytisers": "proselytizers",
"proselytises": "proselytizes",
"proselytising": "proselytizing",
"psychoanalyse": "psychoanalyze",
"psychoanalysed": "psychoanalyzed",
"psychoanalyses": "psychoanalyzes",
"psychoanalysing": "psychoanalyzing",
"publicise": "publicize",
"publicised": "publicized",
"publicises": "publicizes",
"publicising": "publicizing",
"pulverisation": "pulverization",
"pulverise": "pulverize",
"pulverised": "pulverized",
"pulverises": "pulverizes",
"pulverising": "pulverizing",
"pummelled": "pummel",
"pummelling": "pummeled",
"pyjama": "pajama",
"pyjamas": "pajamas",
"pzazz": "pizzazz",
"quarrelled": "quarreled",
"quarrelling": "quarreling",
"radicalise": "radicalize",
"radicalised": "radicalized",
"radicalises": "radicalizes",
"radicalising": "radicalizing",
"rancour": "rancor",
"randomise": "randomize",
"randomised": "randomized",
"randomises": "randomizes",
"randomising": "randomizing",
"rationalisation": "rationalization",
"rationalisations": "rationalizations",
"rationalise": "rationalize",
"rationalised": "rationalized",
"rationalises": "rationalizes",
"rationalising": "rationalizing",
"ravelled": "raveled",
"ravelling": "raveling",
"realisable": "realizable",
"realisation": "realization",
"realisations": "realizations",
"realise": "realize",
"realised": "realized",
"realises": "realizes",
"realising": "realizing",
"recognisable": "recognizable",
"recognisably": "recognizably",
"recognisance": "recognizance",
"recognise": "recognize",
"recognised": "recognized",
"recognises": "recognizes",
"recognising": "recognizing",
"reconnoitre": "reconnoiter",
"reconnoitred": "reconnoitered",
"reconnoitres": "reconnoiters",
"reconnoitring": "reconnoitering",
"refuelled": "refueled",
"refuelling": "refueling",
"regularisation": "regularization",
"regularise": "regularize",
"regularised": "regularized",
"regularises": "regularizes",
"regularising": "regularizing",
"remodelled": "remodeled",
"remodelling": "remodeling",
"remould": "remold",
"remoulded": "remolded",
"remoulding": "remolding",
"remoulds": "remolds",
"reorganisation": "reorganization",
"reorganisations": "reorganizations",
"reorganise": "reorganize",
"reorganised": "reorganized",
"reorganises": "reorganizes",
"reorganising": "reorganizing",
"revelled": "reveled",
"reveller": "reveler",
"revellers": "revelers",
"revelling": "reveling",
"revitalise": "revitalize",
"revitalised": "revitalized",
"revitalises": "revitalizes",
"revitalising": "revitalizing",
"revolutionise": "revolutionize",
"revolutionised": "revolutionized",
"revolutionises": "revolutionizes",
"revolutionising": "revolutionizing",
"rhapsodise": "rhapsodize",
"rhapsodised": "rhapsodized",
"rhapsodises": "rhapsodizes",
"rhapsodising": "rhapsodizing",
"rigour": "rigor",
"rigours": "rigors",
"ritualised": "ritualized",
"rivalled": "rivaled",
"rivalling": "rivaling",
"romanticise": "romanticize",
"romanticised": "romanticized",
"romanticises": "romanticizes",
"romanticising": "romanticizing",
"rumour": "rumor",
"rumoured": "rumored",
"rumours": "rumors",
"sabre": "saber",
"sabres": "sabers",
"saltpetre": "saltpeter",
"sanitise": "sanitize",
"sanitised": "sanitized",
"sanitises": "sanitizes",
"sanitising": "sanitizing",
"satirise": "satirize",
"satirised": "satirized",
"satirises": "satirizes",
"satirising": "satirizing",
"saviour": "savior",
"saviours": "saviors",
"savour": "savor",
"savoured": "savored",
"savouries": "savories",
"savouring": "savoring",
"savours": "savors",
"savoury": "savory",
"scandalise": "scandalize",
"scandalised": "scandalized",
"scandalises": "scandalizes",
"scandalising": "scandalizing",
"sceptic": "skeptic",
"sceptical": "skeptical",
"sceptically": "skeptically",
"scepticism": "skepticism",
"sceptics": "skeptics",
"sceptre": "scepter",
"sceptres": "scepters",
"scrutinise": "scrutinize",
"scrutinised": "scrutinized",
"scrutinises": "scrutinizes",
"scrutinising": "scrutinizing",
"secularisation": "secularization",
"secularise": "secularize",
"secularised": "secularized",
"secularises": "secularizes",
"secularising": "secularizing",
"sensationalise": "sensationalize",
"sensationalised": "sensationalized",
"sensationalises": "sensationalizes",
"sensationalising": "sensationalizing",
"sensitise": "sensitize",
"sensitised": "sensitized",
"sensitises": "sensitizes",
"sensitising": "sensitizing",
"sentimentalise": "sentimentalize",
"sentimentalised": "sentimentalized",
"sentimentalises": "sentimentalizes",
"sentimentalising": "sentimentalizing",
"sepulchre": "sepulcher",
"sepulchres": "sepulchers",
"serialisation": "serialization",
"serialisations": "serializations",
"serialise": "serialize",
"serialised": "serialized",
"serialises": "serializes",
"serialising": "serializing",
"sermonise": "sermonize",
"sermonised": "sermonized",
"sermonises": "sermonizes",
"sermonising": "sermonizing",
"sheikh": "sheik",
"shovelled": "shoveled",
"shovelling": "shoveling",
"shrivelled": "shriveled",
"shrivelling": "shriveling",
"signalise": "signalize",
"signalised": "signalized",
"signalises": "signalizes",
"signalising": "signalizing",
"signalled": "signaled",
"signalling": "signaling",
"smoulder": "smolder",
"smouldered": "smoldered",
"smouldering": "smoldering",
"smoulders": "smolders",
"snivelled": "sniveled",
"snivelling": "sniveling",
"snorkelled": "snorkeled",
"snorkelling": "snorkeling",
"snowplough": "snowplow",
"snowploughs": "snowplow",
"socialisation": "socialization",
"socialise": "socialize",
"socialised": "socialized",
"socialises": "socializes",
"socialising": "socializing",
"sodomise": "sodomize",
"sodomised": "sodomized",
"sodomises": "sodomizes",
"sodomising": "sodomizing",
"solemnise": "solemnize",
"solemnised": "solemnized",
"solemnises": "solemnizes",
"solemnising": "solemnizing",
"sombre": "somber",
"specialisation": "specialization",
"specialisations": "specializations",
"specialise": "specialize",
"specialised": "specialized",
"specialises": "specializes",
"specialising": "specializing",
"spectre": "specter",
"spectres": "specters",
"spiralled": "spiraled",
"spiralling": "spiraling",
"splendour": "splendor",
"splendours": "splendors",
"squirrelled": "squirreled",
"squirrelling": "squirreling",
"stabilisation": "stabilization",
"stabilise": "stabilize",
"stabilised": "stabilized",
"stabiliser": "stabilizer",
"stabilisers": "stabilizers",
"stabilises": "stabilizes",
"stabilising": "stabilizing",
"standardisation": "standardization",
"standardise": "standardize",
"standardised": "standardized",
"standardises": "standardizes",
"standardising": "standardizing",
"stencilled": "stenciled",
"stencilling": "stenciling",
"sterilisation": "sterilization",
"sterilisations": "sterilizations",
"sterilise": "sterilize",
"sterilised": "sterilized",
"steriliser": "sterilizer",
"sterilisers": "sterilizers",
"sterilises": "sterilizes",
"sterilising": "sterilizing",
"stigmatisation": "stigmatization",
"stigmatise": "stigmatize",
"stigmatised": "stigmatized",
"stigmatises": "stigmatizes",
"stigmatising": "stigmatizing",
"storey": "story",
"storeys": "stories",
"subsidisation": "subsidization",
"subsidise": "subsidize",
"subsidised": "subsidized",
"subsidiser": "subsidizer",
"subsidisers": "subsidizers",
"subsidises": "subsidizes",
"subsidising": "subsidizing",
"succour": "succor",
"succoured": "succored",
"succouring": "succoring",
"succours": "succors",
"sulphate": "sulfate",
"sulphates": "sulfates",
"sulphide": "sulfide",
"sulphides": "sulfides",
"sulphur": "sulfur",
"sulphurous": "sulfurous",
"summarise": "summarize",
"summarised": "summarized",
"summarises": "summarizes",
"summarising": "summarizing",
"swivelled": "swiveled",
"swivelling": "swiveling",
"symbolise": "symbolize",
"symbolised": "symbolized",
"symbolises": "symbolizes",
"symbolising": "symbolizing",
"sympathise": "sympathize",
"sympathised": "sympathized",
"sympathiser": "sympathizer",
"sympathisers": "sympathizers",
"sympathises": "sympathizes",
"sympathising": "sympathizing",
"synchronisation": "synchronization",
"synchronise": "synchronize",
"synchronised": "synchronized",
"synchronises": "synchronizes",
"synchronising": "synchronizing",
"synthesise": "synthesize",
"synthesised": "synthesized",
"synthesiser": "synthesizer",
"synthesisers": "synthesizers",
"synthesises": "synthesizes",
"synthesising": "synthesizing",
"syphon": "siphon",
"syphoned": "siphoned",
"syphoning": "siphoning",
"syphons": "siphons",
"systematisation": "systematization",
"systematise": "systematize",
"systematised": "systematized",
"systematises": "systematizes",
"systematising": "systematizing",
"tantalise": "tantalize",
"tantalised": "tantalized",
"tantalises": "tantalizes",
"tantalising": "tantalizing",
"tantalisingly": "tantalizingly",
"tasselled": "tasseled",
"technicolour": "technicolor",
"temporise": "temporize",
"temporised": "temporized",
"temporises": "temporizes",
"temporising": "temporizing",
"tenderise": "tenderize",
"tenderised": "tenderized",
"tenderises": "tenderizes",
"tenderising": "tenderizing",
"terrorise": "terrorize",
"terrorised": "terrorized",
"terrorises": "terrorizes",
"terrorising": "terrorizing",
"theatre": "theater",
"theatregoer": "theatergoer",
"theatregoers": "theatergoers",
"theatres": "theaters",
"theorise": "theorize",
"theorised": "theorized",
"theorises": "theorizes",
"theorising": "theorizing",
"tonne": "ton",
"tonnes": "tons",
"towelled": "toweled",
"towelling": "toweling",
"toxaemia": "toxemia",
"tranquillise": "tranquilize",
"tranquillised": "tranquilized",
"tranquilliser": "tranquilizer",
"tranquillisers": "tranquilizers",
"tranquillises": "tranquilizes",
"tranquillising": "tranquilizing",
"tranquillity": "tranquility",
"tranquillize": "tranquilize",
"tranquillized": "tranquilized",
"tranquillizer": "tranquilizer",
"tranquillizers": "tranquilizers",
"tranquillizes": "tranquilizes",
"tranquillizing": "tranquilizing",
"tranquilly": "tranquility",
"transistorised": "transistorized",
"traumatise": "traumatize",
"traumatised": "traumatized",
"traumatises": "traumatizes",
"traumatising": "traumatizing",
"travelled": "traveled",
"traveller": "traveler",
"travellers": "travelers",
"travelling": "traveling",
"travelog": "travelogue",
"travelogs": "travelogues",
"trialled": "trialed",
"trialling": "trialing",
"tricolour": "tricolor",
"tricolours": "tricolors",
"trivialise": "trivialize",
"trivialised": "trivialized",
"trivialises": "trivializes",
"trivialising": "trivializing",
"tumour": "tumor",
"tumours": "tumors",
"tunnelled": "tunneled",
"tunnelling": "tunneling",
"tyrannise": "tyrannize",
"tyrannised": "tyrannized",
"tyrannises": "tyrannizes",
"tyrannising": "tyrannizing",
"tyre": "tire",
"tyres": "tires",
"unauthorised": "unauthorized",
"uncivilised": "uncivilized",
"underutilised": "underutilized",
"unequalled": "unequaled",
"unfavourable": "unfavorable",
"unfavourably": "unfavorably",
"unionisation": "unionization",
"unionise": "unionize",
"unionised": "unionized",
"unionises": "unionizes",
"unionising": "unionizing",
"unorganised": "unorganized",
"unravelled": "unraveled",
"unravelling": "unraveling",
"unrecognisable": "unrecognizable",
"unrecognised": "unrecognized",
"unrivalled": "unrivaled",
"unsavoury": "unsavory",
"untrammelled": "untrammeled",
"urbanisation": "urbanization",
"urbanise": "urbanize",
"urbanised": "urbanized",
"urbanises": "urbanizes",
"urbanising": "urbanizing",
"utilisable": "utilizable",
"utilisation": "utilization",
"utilise": "utilize",
"utilised": "utilized",
"utilises": "utilizes",
"utilising": "utilizing",
"valour": "valor",
"vandalise": "vandalize",
"vandalised": "vandalized",
"vandalises": "vandalizes",
"vandalising": "vandalizing",
"vaporisation": "vaporization",
"vaporise": "vaporize",
"vaporised": "vaporized",
"vaporises": "vaporizes",
"vaporising": "vaporizing",
"vapour": "vapor",
"vapours": "vapors",
"verbalise": "verbalize",
"verbalised": "verbalized",
"verbalises": "verbalizes",
"verbalising": "verbalizing",
"victimisation": "victimization",
"victimise": "victimize",
"victimised": "victimized",
"victimises": "victimizes",
"victimising": "victimizing",
"videodisc": "videodisk",
"videodiscs": "videodisks",
"vigour": "vigor",
"visualisation": "visualization",
"visualisations": "visualizations",
"visualise": "visualize",
"visualised": "visualized",
"visualises": "visualizes",
"visualising": "visualizing",
"vocalisation": "vocalization",
"vocalisations": "vocalizations",
"vocalise": "vocalize",
"vocalised": "vocalized",
"vocalises": "vocalizes",
"vocalising": "vocalizing",
"vulcanised": "vulcanized",
"vulgarisation": "vulgarization",
"vulgarise": "vulgarize",
"vulgarised": "vulgarized",
"vulgarises": "vulgarizes",
"vulgarising": "vulgarizing",
"waggon": "wagon",
"waggons": "wagons",
"watercolour": "watercolor",
"watercolours": "watercolors",
"weaselled": "weaseled",
"weaselling": "weaseling",
"westernisation": "westernization",
"westernise": "westernize",
"westernised": "westernized",
"westernises": "westernizes",
"westernising": "westernizing",
"womanise": "womanize",
"womanised": "womanized",
"womaniser": "womanizer",
"womanisers": "womanizers",
"womanises": "womanizes",
"womanising": "womanizing",
"woollen": "woolen",
"woollens": "woolens",
"woollies": "woolies",
"woolly": "wooly",
"worshipped": "worshiped",
"worshipping": "worshiping",
"worshipper": "worshiper",
"yodelled": "yodeled",
"yodelling": "yodeling",
"yoghourt": "yogurt",
"yoghourts": "yogurts",
"yoghurt": "yogurt",
"yoghurts": "yogurts",
"mhm": "hmm",
"mm": "hmm",
"mmm": "hmm"
}
\ No newline at end of file
import json
import os
import re
from fractions import Fraction
from typing import Iterator, List, Match, Optional, Union
from more_itertools import windowed
from .basic import remove_symbols_and_diacritics
class EnglishNumberNormalizer:
"""
Convert any spelled-out numbers into arabic numbers, while handling:
- remove any commas
- keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
- spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
- spell out `one` and `ones`
- interpret successive single-digit numbers as nominal: `one oh one` -> `101`
"""
def __init__(self):
super().__init__()
self.zeros = {"o", "oh", "zero"}
self.ones = {
name: i
for i, name in enumerate(
[
"one",
"two",
"three",
"four",
"five",
"six",
"seven",
"eight",
"nine",
"ten",
"eleven",
"twelve",
"thirteen",
"fourteen",
"fifteen",
"sixteen",
"seventeen",
"eighteen",
"nineteen",
],
start=1,
)
}
self.ones_plural = {
"sixes" if name == "six" else name + "s": (value, "s")
for name, value in self.ones.items()
}
self.ones_ordinal = {
"zeroth": (0, "th"),
"first": (1, "st"),
"second": (2, "nd"),
"third": (3, "rd"),
"fifth": (5, "th"),
"twelfth": (12, "th"),
**{
name + ("h" if name.endswith("t") else "th"): (value, "th")
for name, value in self.ones.items()
if value > 3 and value != 5 and value != 12
},
}
self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}
self.tens = {
"twenty": 20,
"thirty": 30,
"forty": 40,
"fifty": 50,
"sixty": 60,
"seventy": 70,
"eighty": 80,
"ninety": 90,
}
self.tens_plural = {
name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
}
self.tens_ordinal = {
name.replace("y", "ieth"): (value, "th") for name, value in self.tens.items()
}
self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}
self.multipliers = {
"hundred": 100,
"thousand": 1_000,
"million": 1_000_000,
"billion": 1_000_000_000,
"trillion": 1_000_000_000_000,
"quadrillion": 1_000_000_000_000_000,
"quintillion": 1_000_000_000_000_000_000,
"sextillion": 1_000_000_000_000_000_000_000,
"septillion": 1_000_000_000_000_000_000_000_000,
"octillion": 1_000_000_000_000_000_000_000_000_000,
"nonillion": 1_000_000_000_000_000_000_000_000_000_000,
"decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
}
self.multipliers_plural = {
name + "s": (value, "s") for name, value in self.multipliers.items()
}
self.multipliers_ordinal = {
name + "th": (value, "th") for name, value in self.multipliers.items()
}
self.multipliers_suffixed = {**self.multipliers_plural, **self.multipliers_ordinal}
self.decimals = {*self.ones, *self.tens, *self.zeros}
self.preceding_prefixers = {
"minus": "-",
"negative": "-",
"plus": "+",
"positive": "+",
}
self.following_prefixers = {
"pound": "£",
"pounds": "£",
"euro": "€",
"euros": "€",
"dollar": "$",
"dollars": "$",
"cent": "¢",
"cents": "¢",
}
self.prefixes = set(
list(self.preceding_prefixers.values()) + list(self.following_prefixers.values())
)
self.suffixers = {
"per": {"cent": "%"},
"percent": "%",
}
self.specials = {"and", "double", "triple", "point"}
self.words = set(
[
key
for mapping in [
self.zeros,
self.ones,
self.ones_suffixed,
self.tens,
self.tens_suffixed,
self.multipliers,
self.multipliers_suffixed,
self.preceding_prefixers,
self.following_prefixers,
self.suffixers,
self.specials,
]
for key in mapping
]
)
self.literal_words = {"one", "ones"}
    def process_words(self, words: List[str]) -> Iterator[str]:
        """
        Convert a stream of English number words into numeral strings.

        Scans `words` with a (prev, current, next) sliding window, accumulating a
        partial number in `value` (an int while it can still be combined
        arithmetically, a str once digits must be concatenated textually, e.g. for
        decimals or "double five") and an optional sign/currency symbol in `prefix`.
        Words that are not part of a number are yielded unchanged; completed
        numbers are yielded via `output()`, which applies the pending prefix and
        resets the accumulator state.
        """
        prefix: Optional[str] = None
        value: Optional[Union[str, int]] = None
        skip = False
        def to_fraction(s: str):
            # parse an arabic-number token exactly (Fraction avoids float rounding);
            # returns None when `s` is not a valid number
            try:
                return Fraction(s)
            except ValueError:
                return None
        def output(result: Union[str, int]):
            # finalize the current number: prepend the pending prefix and reset state
            nonlocal prefix, value
            result = str(result)
            if prefix is not None:
                result = prefix + result
            value = None
            prefix = None
            return result
        if len(words) == 0:
            return
        # `windowed` (imported earlier in this module, more_itertools-style — confirm)
        # yields consecutive triples; the None padding marks the sequence edges
        for prev, current, next in windowed([None] + words + [None], 3):
            if skip:
                # the previous iteration already consumed `current`
                skip = False
                continue
            next_is_numeric = next is not None and re.match(r"^\d+(\.\d+)?$", next)
            has_prefix = current[0] in self.prefixes
            current_without_prefix = current[1:] if has_prefix else current
            if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
                # arabic numbers (potentially with signs and fractions)
                f = to_fraction(current_without_prefix)
                assert f is not None
                if value is not None:
                    if isinstance(value, str) and value.endswith("."):
                        # concatenate decimals / ip address components
                        value = str(value) + str(current)
                        continue
                    else:
                        yield output(value)
                prefix = current[0] if has_prefix else prefix
                if f.denominator == 1:
                    value = f.numerator  # store integers as int
                else:
                    value = current_without_prefix
            elif current not in self.words:
                # non-numeric words
                if value is not None:
                    yield output(value)
                yield output(current)
            elif current in self.zeros:
                # "oh"/"zero": always textual concatenation (e.g. "five oh nine")
                value = str(value or "") + "0"
            elif current in self.ones:
                ones = self.ones[current]
                if value is None:
                    value = ones
                elif isinstance(value, str) or prev in self.ones:
                    if prev in self.tens and ones < 10:  # replace the last zero with the digit
                        assert value[-1] == "0"
                        value = value[:-1] + str(ones)
                    else:
                        value = str(value) + str(ones)
                elif ones < 10:
                    if value % 10 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
            elif current in self.ones_suffixed:
                # ordinal or cardinal; yield the number right away
                ones, suffix = self.ones_suffixed[current]
                if value is None:
                    yield output(str(ones) + suffix)
                elif isinstance(value, str) or prev in self.ones:
                    if prev in self.tens and ones < 10:
                        assert value[-1] == "0"
                        yield output(value[:-1] + str(ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                elif ones < 10:
                    if value % 10 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                value = None
            elif current in self.tens:
                tens = self.tens[current]
                if value is None:
                    value = tens
                elif isinstance(value, str):
                    value = str(value) + str(tens)
                else:
                    if value % 100 == 0:
                        value += tens
                    else:
                        value = str(value) + str(tens)
            elif current in self.tens_suffixed:
                # ordinal or cardinal; yield the number right away
                tens, suffix = self.tens_suffixed[current]
                if value is None:
                    yield output(str(tens) + suffix)
                elif isinstance(value, str):
                    yield output(str(value) + str(tens) + suffix)
                else:
                    if value % 100 == 0:
                        yield output(str(value + tens) + suffix)
                    else:
                        yield output(str(value) + str(tens) + suffix)
            elif current in self.multipliers:
                multiplier = self.multipliers[current]
                if value is None:
                    value = multiplier
                elif isinstance(value, str) or value == 0:
                    # only multiply when the textual value is still an exact integer;
                    # otherwise flush it and start over with the multiplier
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        value = p.numerator
                    else:
                        yield output(value)
                        value = multiplier
                else:
                    # e.g. "two thousand five hundred": scale only the sub-thousand residual
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
            elif current in self.multipliers_suffixed:
                multiplier, suffix = self.multipliers_suffixed[current]
                if value is None:
                    yield output(str(multiplier) + suffix)
                elif isinstance(value, str):
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        yield output(str(p.numerator) + suffix)
                    else:
                        yield output(value)
                        yield output(str(multiplier) + suffix)
                else:  # int
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
                    yield output(str(value) + suffix)
                value = None
            elif current in self.preceding_prefixers:
                # apply prefix (positive, minus, etc.) if it precedes a number
                if value is not None:
                    yield output(value)
                if next in self.words or next_is_numeric:
                    prefix = self.preceding_prefixers[current]
                else:
                    yield output(current)
            elif current in self.following_prefixers:
                # apply prefix (dollars, cents, etc.) only after a number
                if value is not None:
                    prefix = self.following_prefixers[current]
                    yield output(value)
                else:
                    yield output(current)
            elif current in self.suffixers:
                # apply suffix symbols (percent -> '%')
                if value is not None:
                    suffix = self.suffixers[current]
                    if isinstance(suffix, dict):
                        # two-word suffix, e.g. "per cent"
                        if next in suffix:
                            yield output(str(value) + suffix[next])
                            skip = True
                        else:
                            yield output(value)
                            yield output(current)
                    else:
                        yield output(str(value) + suffix)
                else:
                    yield output(current)
            elif current in self.specials:
                if next not in self.words and not next_is_numeric:
                    # apply special handling only if the next word can be numeric
                    if value is not None:
                        yield output(value)
                    yield output(current)
                elif current == "and":
                    # ignore "and" after hundreds, thousands, etc.
                    if prev not in self.multipliers:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "double" or current == "triple":
                    if next in self.ones or next in self.zeros:
                        repeats = 2 if current == "double" else 3
                        ones = self.ones.get(next, 0)
                        value = str(value or "") + str(ones) * repeats
                        skip = True
                    else:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "point":
                    if next in self.decimals or next_is_numeric:
                        value = str(value or "") + "."
                else:
                    # should all have been covered at this point
                    raise ValueError(f"Unexpected token: {current}")
            else:
                # all should have been covered at this point
                raise ValueError(f"Unexpected token: {current}")
        if value is not None:
            yield output(value)
def preprocess(self, s: str):
# replace "<number> and a half" with "<number> point five"
results = []
segments = re.split(r"\band\s+a\s+half\b", s)
for i, segment in enumerate(segments):
if len(segment.strip()) == 0:
continue
if i == len(segments) - 1:
results.append(segment)
else:
results.append(segment)
last_word = segment.rsplit(maxsplit=2)[-1]
if last_word in self.decimals or last_word in self.multipliers:
results.append("point five")
else:
results.append("and a half")
s = " ".join(results)
# put a space at number/letter boundary
s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
# but remove spaces which could be a suffix
s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)
return s
def postprocess(self, s: str):
def combine_cents(m: Match):
try:
currency = m.group(1)
integer = m.group(2)
cents = int(m.group(3))
return f"{currency}{integer}.{cents:02d}"
except ValueError:
return m.string
def extract_cents(m: Match):
try:
return f{int(m.group(1))}"
except ValueError:
return m.string
# apply currency postprocessing; "$2 and ¢7" -> "$2.07"
s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)
# write "one(s)" instead of "1(s)", just for the readability
s = re.sub(r"\b1(s?)\b", r"one\1", s)
return s
def __call__(self, s: str):
s = self.preprocess(s)
s = " ".join(word for word in self.process_words(s.split()) if word is not None)
s = self.postprocess(s)
return s
class EnglishSpellingNormalizer:
    """
    Applies British-American spelling mappings as listed in [1].
    [1] https://www.tysto.com/uk-us-spelling-list.html
    """
    def __init__(self):
        # the word -> American-spelling mapping ships as english.json next to this module
        mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
        # use a context manager so the file handle is closed promptly
        # (was `json.load(open(mapping_path))`, which leaked the handle)
        with open(mapping_path, encoding="utf-8") as f:
            self.mapping = json.load(f)
    def __call__(self, s: str):
        # replace each whitespace-separated word that has a mapping; pass others through
        return " ".join(self.mapping.get(word, word) for word in s.split())
class EnglishTextNormalizer:
    """
    End-to-end English text normalizer: lowercases, strips bracketed words and
    fillers, expands contractions and abbreviated titles, then delegates to
    EnglishNumberNormalizer and EnglishSpellingNormalizer before collapsing
    whitespace.
    """
    def __init__(self):
        # filler words that are removed outright
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
        # regex -> replacement, applied in insertion order by __call__
        self.replacers = {
            # common contractions
            r"\bwon't\b": "will not",
            r"\bcan't\b": "can not",
            r"\blet's\b": "let us",
            r"\bain't\b": "aint",
            r"\by'all\b": "you all",
            r"\bwanna\b": "want to",
            r"\bgotta\b": "got to",
            r"\bgonna\b": "going to",
            r"\bi'ma\b": "i am going to",
            r"\bimma\b": "i am going to",
            r"\bwoulda\b": "would have",
            r"\bcoulda\b": "could have",
            r"\bshoulda\b": "should have",
            r"\bma'am\b": "madam",
            # contractions in titles/prefixes
            r"\bmr\b": "mister ",
            r"\bmrs\b": "missus ",
            r"\bst\b": "saint ",
            r"\bdr\b": "doctor ",
            r"\bprof\b": "professor ",
            r"\bcapt\b": "captain ",
            r"\bgov\b": "governor ",
            r"\bald\b": "alderman ",
            r"\bgen\b": "general ",
            r"\bsen\b": "senator ",
            r"\brep\b": "representative ",
            r"\bpres\b": "president ",
            r"\brev\b": "reverend ",
            r"\bhon\b": "honorable ",
            r"\basst\b": "assistant ",
            r"\bassoc\b": "associate ",
            r"\blt\b": "lieutenant ",
            r"\bcol\b": "colonel ",
            r"\bjr\b": "junior ",
            r"\bsr\b": "senior ",
            r"\besq\b": "esquire ",
            # perfect tenses, ideally it should be any past participles, but it's harder..
            r"'d been\b": " had been",
            r"'s been\b": " has been",
            r"'d gone\b": " had gone",
            r"'s gone\b": " has gone",
            r"'d done\b": " had done",  # "'s done" is ambiguous
            r"'s got\b": " has got",
            # general contractions
            r"n't\b": " not",
            r"'re\b": " are",
            r"'s\b": " is",
            r"'d\b": " would",
            r"'ll\b": " will",
            r"'t\b": " not",
            r"'ve\b": " have",
            r"'m\b": " am",
        }
        self.standardize_numbers = EnglishNumberNormalizer()
        self.standardize_spellings = EnglishSpellingNormalizer()
    def __call__(self, s: str):
        s = s.lower()
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis
        s = re.sub(self.ignore_patterns, "", s)
        s = re.sub(r"\s+'", "'", s) # standardize when there's a space before an apostrophe
        for pattern, replacement in self.replacers.items():
            s = re.sub(pattern, replacement, s)
        s = re.sub(r"(\d),(\d)", r"\1\2", s) # remove commas between digits
        s = re.sub(r"\.([^0-9]|$)", r" \1", s) # remove periods not followed by numbers
        # `remove_symbols_and_diacritics` is defined earlier in this module;
        # the kept symbols are consumed by the number normalizer below
        s = remove_symbols_and_diacritics(s, keep=".%$¢€£") # keep some symbols for numerics
        s = self.standardize_numbers(s)
        s = self.standardize_spellings(s)
        # now remove prefix/suffix symbols that are not preceded/followed by numbers
        s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
        s = re.sub(r"([^0-9])%", r"\1 ", s)
        s = re.sub(r"\s+", " ", s) # replace any successive whitespace characters with a space
        return s
import os
from dataclasses import dataclass
from functools import lru_cache
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
from transformers import GPT2TokenizerFast
# language code (mostly ISO 639-1; a few longer codes like "haw") -> lowercase English name
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "iw": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}
# language code lookup by name, with a few language aliases
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}
@dataclass(frozen=True)
class Tokenizer:
    """A thin wrapper around `GPT2TokenizerFast` providing quick access to special tokens"""
    # the underlying HuggingFace fast tokenizer
    tokenizer: "GPT2TokenizerFast"
    # language code (e.g. "en"), or None for the English-only tokenizer
    language: Optional[str]
    # token ids emitted at the start of decoding: <|startoftranscript|>[, language][, task]
    sot_sequence: Tuple[int, ...]
    def encode(self, text, **kwargs):
        return self.tokenizer.encode(text, **kwargs)
    def decode(self, token_ids: Union[int, List[int], np.ndarray, torch.Tensor], **kwargs):
        return self.tokenizer.decode(token_ids, **kwargs)
    def decode_with_timestamps(self, tokens) -> str:
        """
        Timestamp tokens are above the special tokens' id range and are ignored by `decode()`.
        This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
        """
        outputs = [[]]
        for token in tokens:
            if token >= self.timestamp_begin:
                # each timestamp token encodes a multiple of 0.02 seconds
                timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>"
                outputs.append(timestamp)
                outputs.append([])
            else:
                outputs[-1].append(token)
        outputs = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs]
        return "".join(outputs)
    # NOTE(review): `@lru_cache` on these properties keys the cache on `self`,
    # which keeps each Tokenizer instance alive for the cache's lifetime; this is
    # presumably tolerable because tokenizers are built once via the cached
    # `get_tokenizer` — confirm before creating Tokenizer instances dynamically.
    @property
    @lru_cache()
    def eot(self) -> int:
        # end-of-transcript is the tokenizer's EOS token
        return self.tokenizer.eos_token_id
    @property
    @lru_cache()
    def sot(self) -> int:
        return self._get_single_token_id("<|startoftranscript|>")
    @property
    @lru_cache()
    def sot_lm(self) -> int:
        return self._get_single_token_id("<|startoflm|>")
    @property
    @lru_cache()
    def sot_prev(self) -> int:
        return self._get_single_token_id("<|startofprev|>")
    @property
    @lru_cache()
    def no_speech(self) -> int:
        return self._get_single_token_id("<|nospeech|>")
    @property
    @lru_cache()
    def no_timestamps(self) -> int:
        return self._get_single_token_id("<|notimestamps|>")
    @property
    @lru_cache()
    def timestamp_begin(self) -> int:
        # timestamp token ids are assumed to start right after the last special token id
        return self.tokenizer.all_special_ids[-1] + 1
    @property
    @lru_cache()
    def language_token(self) -> int:
        """Returns the token id corresponding to the value of the `language` field"""
        if self.language is None:
            raise ValueError(f"This tokenizer does not have language token configured")
        additional_tokens = dict(
            zip(
                self.tokenizer.additional_special_tokens,
                self.tokenizer.additional_special_tokens_ids,
            )
        )
        candidate = f"<|{self.language}|>"
        if candidate in additional_tokens:
            return additional_tokens[candidate]
        raise KeyError(f"Language {self.language} not found in tokenizer.")
    @property
    @lru_cache()
    def all_language_tokens(self) -> Tuple[int]:
        # ids of every "<|xx|>" special token whose code appears in LANGUAGES
        result = []
        for token, token_id in zip(
            self.tokenizer.additional_special_tokens,
            self.tokenizer.additional_special_tokens_ids,
        ):
            if token.strip("<|>") in LANGUAGES:
                result.append(token_id)
        return tuple(result)
    @property
    @lru_cache()
    def all_language_codes(self) -> Tuple[str]:
        # language codes in the same order as `all_language_tokens`
        return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens)
    @property
    @lru_cache()
    def sot_sequence_including_notimestamps(self) -> Tuple[int]:
        return tuple(list(self.sot_sequence) + [self.no_timestamps])
    @property
    @lru_cache()
    def non_speech_tokens(self) -> Tuple[int]:
        """
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,
        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
        """
        symbols = list("\"#()*+/:;<=>@[\\]^_`{|}~「」『』")
        symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
        # symbols that may be a single token or multiple tokens depending on the tokenizer.
        # In case they're multiple tokens, suppress the first token, which is safe because:
        # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
        # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
        miscellaneous = set("♩♪♫♬♭♮♯")
        assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
        # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
        result = {self.tokenizer.encode(" -")[0], self.tokenizer.encode(" '")[0]}
        for symbol in symbols + list(miscellaneous):
            for tokens in [self.tokenizer.encode(symbol), self.tokenizer.encode(" " + symbol)]:
                if len(tokens) == 1 or symbol in miscellaneous:
                    result.add(tokens[0])
        return tuple(sorted(result))
    def _get_single_token_id(self, text) -> int:
        # helper: encode `text` and assert it maps to exactly one token id
        tokens = self.tokenizer.encode(text)
        assert len(tokens) == 1, f"{text} is not encoded as a single token"
        return tokens[0]
@lru_cache(maxsize=None)
def build_tokenizer(name: str = "gpt2"):
    """Load the GPT-2 fast tokenizer bundled under assets/<name> and register Whisper's special tokens."""
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    tokenizer_path = os.path.join(os.path.dirname(__file__), "assets", name)
    tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_path)
    # transcript start, one token per language, then the task/context markers
    special_tokens = ["<|startoftranscript|>"]
    special_tokens += [f"<|{lang}|>" for lang in LANGUAGES]
    special_tokens += [
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nospeech|>",
        "<|notimestamps|>",
    ]
    tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
    return tokenizer
@lru_cache(maxsize=None)
def get_tokenizer(
    multilingual: bool,
    *,
    task: Optional[str] = None,  # Literal["transcribe", "translate", None]
    language: Optional[str] = None,
) -> Tokenizer:
    """
    Build (and cache) a `Tokenizer` for the multilingual or English-only model,
    resolving language names/aliases to codes and assembling the SOT sequence.
    """
    if language is not None:
        language = language.lower()
        if language not in LANGUAGES:
            # accept full names and aliases ("castilian" -> "es", ...)
            if language not in TO_LANGUAGE_CODE:
                raise ValueError(f"Unsupported language: {language}")
            language = TO_LANGUAGE_CODE[language]
    tokenizer_name = "multilingual" if multilingual else "gpt2"
    if multilingual:
        task = task or "transcribe"
        language = language or "en"
    else:
        # the English-only tokenizer carries no language/task tokens
        task = None
        language = None
    tokenizer = build_tokenizer(name=tokenizer_name)
    all_special_ids: List[int] = tokenizer.all_special_ids
    sot: int = all_special_ids[1]
    translate: int = all_special_ids[-6]
    transcribe: int = all_special_ids[-5]
    langs = tuple(LANGUAGES.keys())
    # language tokens follow <|startoftranscript|> in registration order
    sot_sequence = [sot]
    if language is not None:
        sot_sequence.append(sot + 1 + langs.index(language))
    if task is not None:
        sot_sequence.append(transcribe if task == "transcribe" else translate)
    return Tokenizer(tokenizer=tokenizer, language=language, sot_sequence=tuple(sot_sequence))
import argparse
import os
import warnings
from typing import List, Optional, Tuple, Union, TYPE_CHECKING
import numpy as np
import torch
import tqdm
from .audio import SAMPLE_RATE, N_FRAMES, HOP_LENGTH, pad_or_trim, log_mel_spectrogram
from .decoding import DecodingOptions, DecodingResult
from .tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer
from .utils import exact_div, format_timestamp, optional_int, optional_float, str2bool, write_txt, write_vtt, write_srt
if TYPE_CHECKING:
from .model import Whisper
def transcribe(
    model: "Whisper",
    audio: Union[str, np.ndarray, torch.Tensor],
    *,
    verbose: Optional[bool] = None,
    temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    compression_ratio_threshold: Optional[float] = 2.4,
    logprob_threshold: Optional[float] = -1.0,
    no_speech_threshold: Optional[float] = 0.6,
    condition_on_previous_text: bool = True,
    force_extraction: bool = False,
    **decode_options,
):
    """
    Extract Whisper encoder embeddings for an audio file, window by window.

    NOTE(review): this is a trimmed-down variant of Whisper's `transcribe` that
    only runs the encoder; no text is decoded. The decoding-related parameters
    (`temperature`, `compression_ratio_threshold`, `logprob_threshold`,
    `no_speech_threshold`, `condition_on_previous_text`, `force_extraction`,
    and most of `decode_options`) are accepted for signature compatibility but
    unused here — confirm against callers before removing.

    Parameters
    ----------
    model: Whisper
        The Whisper model instance
    audio: Union[str, np.ndarray, torch.Tensor]
        The path to the audio file to open, or the audio waveform
    verbose: bool
        Whether to display the text being decoded to the console. If True, displays all the details,
        If False, displays minimal details. If None, does not display anything
    decode_options: dict
        Keyword arguments to construct `DecodingOptions` instances; only "fp16" is consulted here

    Returns
    -------
    A dictionary with a single "segments" key: a list of dicts carrying "start"
    and "end" (mel-frame indices) and the window's "encoder_embeddings".
    """
    # default to fp16 inference unless explicitly disabled via decode_options
    dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
    if model.device == torch.device("cpu"):
        if torch.cuda.is_available():
            warnings.warn("Performing inference on CPU when CUDA is available")
        if dtype == torch.float16:
            warnings.warn("FP16 is not supported on CPU; using FP32 instead")
            dtype = torch.float32
    if dtype == torch.float32:
        decode_options["fp16"] = False
    mel = log_mel_spectrogram(audio)
    all_segments = []
    def add_segment(
        *, start: float, end: float, encoder_embeddings
    ):
        # record one window's frame range and its encoder embeddings
        all_segments.append(
            {
                "start": start,
                "end": end,
                "encoder_embeddings":encoder_embeddings,
            }
        )
    # show the progress bar when verbose is False (otherwise the transcribed text will be printed)
    num_frames = mel.shape[-1]
    seek = 0
    previous_seek_value = seek  # NOTE(review): unused in this trimmed-down loop
    sample_skip = 3000  # mel frames consumed per window; presumably equals N_FRAMES (30 s) — confirm
    # NOTE(review): pbar is created but never advanced via pbar.update()
    with tqdm.tqdm(total=num_frames, unit='frames', disable=verbose is not False) as pbar:
        while seek < num_frames:
            # seek is the frame index where this window starts
            end_seek = min(seek + sample_skip, num_frames)
            segment = pad_or_trim(mel[:,seek:seek+sample_skip], N_FRAMES).to(model.device).to(dtype)
            # add a batch dimension when given a single (n_mels, frames) mel
            single = segment.ndim == 2
            if single:
                segment = segment.unsqueeze(0)
            if dtype == torch.float16:
                segment = segment.half()
            audio_features, embeddings = model.encoder(segment, include_embeddings = True)
            encoder_embeddings = embeddings
            add_segment(
                start=seek,
                end=end_seek,
                encoder_embeddings=encoder_embeddings,
            )
            seek+=sample_skip
    return dict(segments=all_segments)
def cli():
    """
    Command-line entry point: transcribe the given audio file(s) with Whisper
    and write .txt/.vtt/.srt outputs next to each other in --output_dir.
    (Message spelling fixes: "received" in the English-only warning,
    "supersedes" in the --threads help text.)
    """
    from . import available_models
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
    parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")
    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
    parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
    parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
    parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")
    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
    parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supersedes MKL_NUM_THREADS/OMP_NUM_THREADS")
    args = parser.parse_args().__dict__
    model_name: str = args.pop("model")
    model_dir: str = args.pop("model_dir")
    output_dir: str = args.pop("output_dir")
    device: str = args.pop("device")
    os.makedirs(output_dir, exist_ok=True)
    if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
        if args["language"] is not None:
            # fixed typo in user-facing warning: "receipted" -> "received"
            warnings.warn(f"{model_name} is an English-only model but received '{args['language']}'; using English instead.")
        args["language"] = "en"
    temperature = args.pop("temperature")
    temperature_increment_on_fallback = args.pop("temperature_increment_on_fallback")
    if temperature_increment_on_fallback is not None:
        # try increasing temperatures up to 1.0 as fallbacks
        temperature = tuple(np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback))
    else:
        temperature = [temperature]
    threads = args.pop("threads")
    if threads > 0:
        torch.set_num_threads(threads)
    from . import load_model
    model = load_model(model_name, device=device, download_root=model_dir)
    for audio_path in args.pop("audio"):
        result = transcribe(model, audio_path, temperature=temperature, **args)
        audio_basename = os.path.basename(audio_path)
        # NOTE(review): the trimmed `transcribe` in this file returns segments with only
        # start/end/encoder_embeddings, but the writers below expect a 'text' key —
        # confirm which transcribe implementation this CLI is meant to call.
        # save TXT
        with open(os.path.join(output_dir, audio_basename + ".txt"), "w", encoding="utf-8") as txt:
            write_txt(result["segments"], file=txt)
        # save VTT
        with open(os.path.join(output_dir, audio_basename + ".vtt"), "w", encoding="utf-8") as vtt:
            write_vtt(result["segments"], file=vtt)
        # save SRT
        with open(os.path.join(output_dir, audio_basename + ".srt"), "w", encoding="utf-8") as srt:
            write_srt(result["segments"], file=srt)
if __name__ == '__main__':
    cli()
import zlib
from typing import Iterator, TextIO
def exact_div(x, y):
    """Return x // y, asserting that y divides x exactly."""
    quotient, remainder = divmod(x, y)
    assert remainder == 0
    return quotient
def str2bool(string):
    """Parse the literal strings "True"/"False" into booleans; anything else raises ValueError."""
    str2val = {"True": True, "False": False}
    if string not in str2val:
        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
    return str2val[string]
def optional_int(string):
    """Parse an int, treating the literal string "None" as None (argparse helper)."""
    if string == "None":
        return None
    return int(string)
def optional_float(string):
    """Parse a float, treating the literal string "None" as None (argparse helper)."""
    if string == "None":
        return None
    return float(string)
def compression_ratio(text) -> float:
    """Ratio of raw UTF-8 length to zlib-compressed length; high values indicate repetitive text."""
    encoded = text.encode("utf-8")
    return len(encoded) / len(zlib.compress(encoded))
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'):
    """
    Render a non-negative duration in seconds as "[HH:]MM:SS<marker>mmm",
    rounding to the nearest millisecond; the hours field is included only when
    non-zero or when `always_include_hours` is set.
    """
    assert seconds >= 0, "non-negative timestamp expected"
    total_ms = round(seconds * 1000.0)
    hours, total_ms = divmod(total_ms, 3_600_000)
    minutes, total_ms = divmod(total_ms, 60_000)
    whole_seconds, milliseconds = divmod(total_ms, 1_000)
    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{whole_seconds:02d}{decimal_marker}{milliseconds:03d}"
def write_txt(transcript: Iterator[dict], file: TextIO):
    """Write one stripped segment text per line to `file`."""
    for entry in transcript:
        print(entry['text'].strip(), file=file, flush=True)
def write_vtt(transcript: Iterator[dict], file: TextIO):
    """Write the segments as a WEBVTT file: header, then one cue per segment."""
    print("WEBVTT\n", file=file)
    for segment in transcript:
        start = format_timestamp(segment['start'])
        end = format_timestamp(segment['end'])
        # "-->" inside the text would break the cue syntax, so soften it
        text = segment['text'].strip().replace('-->', '->')
        print(f"{start} --> {end}\n{text}\n", file=file, flush=True)
def write_srt(transcript: Iterator[dict], file: TextIO):
    """
    Write a transcript to a file in SRT format (1-based cue index, comma decimal
    marker, hours always included).
    Example usage:
        from pathlib import Path
        from whisper.utils import write_srt
        result = transcribe(model, audio_path, temperature=temperature, **args)
        # save SRT
        audio_basename = Path(audio_path).stem
        with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
            write_srt(result["segments"], file=srt)
    """
    for index, segment in enumerate(transcript, start=1):
        start = format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')
        end = format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')
        # "-->" inside the text would break the cue syntax, so soften it
        text = segment['text'].strip().replace('-->', '->')
        print(f"{index}\n{start} --> {end}\n{text}\n", file=file, flush=True)
ffmpeg
libgl1
\ No newline at end of file
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from latentsync.utils.util import read_video, write_video
from latentsync.utils.image_processor import ImageProcessor
import torch
from einops import rearrange
import os
import tqdm
import subprocess
from multiprocessing import Process
import shutil
# module-level accumulator of (input, output) video path pairs
paths = []
def gather_video_paths(input_dir, output_dir):
    """
    Recursively collect (input, output) .mp4 path pairs under `input_dir` into
    the module-level `paths` list, mirroring the directory layout under
    `output_dir` and skipping videos whose output already exists.
    """
    for entry in sorted(os.listdir(input_dir)):
        src = os.path.join(input_dir, entry)
        dst = os.path.join(output_dir, entry)
        if entry.endswith(".mp4"):
            if not os.path.isfile(dst):
                paths.append((src, dst))
        elif os.path.isdir(src):
            gather_video_paths(src, dst)
class FaceDetector:
    """Thin wrapper around ImageProcessor that face-aligns every frame of a video."""
    def __init__(self, resolution: int = 512, device: str = "cpu"):
        # "fix_mask" selects the fixed-mask mode of the processor
        self.image_processor = ImageProcessor(resolution, "fix_mask", device)
    def affine_transform_video(self, video_path):
        """Read a video (keeping its fps) and return face-aligned frames as an (f, h, w, c) numpy array."""
        frames = read_video(video_path, change_fps=False)
        # affine_transform returns (frame, ...); only the transformed frame is kept
        aligned = [self.image_processor.affine_transform(frame)[0] for frame in frames]
        stacked = torch.stack(aligned)
        return rearrange(stacked, "f c h w -> f h w c").numpy()
    def close(self):
        """Release resources held by the underlying image processor."""
        self.image_processor.close()
def combine_video_audio(video_frames, video_input_path, video_output_path, process_temp_dir):
    """Mux processed frames with the original video's audio track.

    Writes the frames to a temp video at 25 fps, extracts the source audio to a
    temp wav, merges them into video_output_path, then removes the temp files.
    """
    video_name = os.path.basename(video_input_path)[:-4]
    audio_temp = os.path.join(process_temp_dir, f"{video_name}_temp.wav")
    video_temp = os.path.join(process_temp_dir, f"{video_name}_temp.mp4")

    write_video(video_temp, video_frames, fps=25)

    # Use argument lists with shell=False: paths containing spaces or shell
    # metacharacters can neither break the command nor inject into a shell.
    subprocess.run(
        ["ffmpeg", "-y", "-loglevel", "error", "-i", video_input_path,
         "-q:a", "0", "-map", "a", audio_temp]
    )
    os.makedirs(os.path.dirname(video_output_path), exist_ok=True)
    subprocess.run(
        ["ffmpeg", "-y", "-loglevel", "error", "-i", video_temp, "-i", audio_temp,
         "-c:v", "libx264", "-c:a", "aac", "-map", "0:v", "-map", "1:a",
         "-q:v", "0", "-q:a", "0", video_output_path]
    )

    os.remove(audio_temp)
    os.remove(video_temp)
def func(paths, process_temp_dir, device_id, resolution):
    """Worker process: affine-align each assigned video on GPU `device_id`."""
    os.makedirs(process_temp_dir, exist_ok=True)
    detector = FaceDetector(resolution, f"cuda:{device_id}")
    for src, dst in paths:
        if os.path.isfile(dst):
            continue  # already produced by an earlier run
        try:
            frames = detector.affine_transform_video(src)
        except Exception as e:  # e.g. face not detected in some frame
            print(f"Exception: {e} - {src}")
            continue
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        combine_video_audio(frames, src, dst, process_temp_dir)
        print(f"Saved: {dst}")
    detector.close()
def split(a, n):
    """Yield `n` contiguous chunks of `a` whose lengths differ by at most one."""
    base, extra = divmod(len(a), n)
    for i in range(n):
        start = i * base + min(i, extra)
        # The first `extra` chunks get one additional element.
        yield a[start : start + base + (1 if i < extra else 0)]
def affine_transform_multi_gpus(input_dir, output_dir, temp_dir, resolution, num_workers):
    """Affine-align all videos under input_dir using `num_workers` processes per GPU.

    Gathers unprocessed videos, splits them evenly across
    num_workers * num_devices worker processes, and joins them all.
    Raises RuntimeError when no CUDA device is available.
    """
    print(f"Recursively gathering video paths of {input_dir} ...")
    gather_video_paths(input_dir, output_dir)

    num_devices = torch.cuda.device_count()
    if num_devices == 0:
        raise RuntimeError("No GPUs found")

    # Start from a clean temp tree.
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir, exist_ok=True)

    split_paths = list(split(paths, num_workers * num_devices))

    processes = []
    for i in range(num_devices):
        for j in range(num_workers):
            process_index = i * num_workers + j
            # Key each worker's temp dir by process_index, NOT device index:
            # the original used f"process_{i}", so all workers on one GPU
            # shared a temp dir and could clobber each other's temp wav/mp4
            # files when basenames collide across subdirectories.
            process = Process(
                target=func,
                args=(
                    split_paths[process_index],
                    os.path.join(temp_dir, f"process_{process_index}"),
                    i,
                    resolution,
                ),
            )
            process.start()
            processes.append(process)

    for process in processes:
        process.join()
if __name__ == "__main__":
    # Standalone run: align the training split of the avatars dataset.
    input_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/avatars/resampled/train"
    output_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/avatars/affine_transformed/train"
    temp_dir = "temp"
    resolution = 256
    num_workers = 10  # How many processes per device
    affine_transform_multi_gpus(input_dir, output_dir, temp_dir, resolution, num_workers)
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from preprocess.affine_transform import affine_transform_multi_gpus
from preprocess.remove_broken_videos import remove_broken_videos_multiprocessing
from preprocess.detect_shot import detect_shot_multiprocessing
from preprocess.filter_high_resolution import filter_high_resolution_multiprocessing
from preprocess.resample_fps_hz import resample_fps_hz_multiprocessing
from preprocess.segment_videos import segment_videos_multiprocessing
from preprocess.sync_av import sync_av_multi_gpus
from preprocess.filter_visual_quality import filter_visual_quality_multi_gpus
from preprocess.remove_incorrect_affined import remove_incorrect_affined_multiprocessing
def data_processing_pipeline(
    total_num_workers, per_gpu_num_workers, resolution, sync_conf_threshold, temp_dir, input_dir
):
    """Run the full data-cleaning pipeline on raw videos under input_dir.

    Each stage writes its output into a sibling directory of input_dir and
    feeds the next stage. CPU-bound stages use total_num_workers processes;
    GPU-bound stages use per_gpu_num_workers processes per device.
    """
    parent_dir = os.path.dirname(input_dir)

    print("Removing broken videos...")
    remove_broken_videos_multiprocessing(input_dir, total_num_workers)

    print("Resampling FPS hz...")
    resampled_dir = os.path.join(parent_dir, "resampled")
    resample_fps_hz_multiprocessing(input_dir, resampled_dir, total_num_workers)

    print("Detecting shot...")
    shot_dir = os.path.join(parent_dir, "shot")
    detect_shot_multiprocessing(resampled_dir, shot_dir, total_num_workers)

    print("Segmenting videos...")
    segmented_dir = os.path.join(parent_dir, "segmented")
    segment_videos_multiprocessing(shot_dir, segmented_dir, total_num_workers)

    print("Filtering high resolution...")
    high_resolution_dir = os.path.join(parent_dir, "high_resolution")
    filter_high_resolution_multiprocessing(segmented_dir, high_resolution_dir, resolution, total_num_workers)

    print("Affine transforming videos...")
    affine_transformed_dir = os.path.join(parent_dir, "affine_transformed")
    affine_transform_multi_gpus(
        high_resolution_dir, affine_transformed_dir, temp_dir, resolution, per_gpu_num_workers // 2
    )

    print("Removing incorrect affined videos...")
    remove_incorrect_affined_multiprocessing(affine_transformed_dir, total_num_workers)

    print("Syncing audio and video...")
    av_synced_dir = os.path.join(parent_dir, f"av_synced_{sync_conf_threshold}")
    sync_av_multi_gpus(affine_transformed_dir, av_synced_dir, temp_dir, per_gpu_num_workers, sync_conf_threshold)

    print("Filtering visual quality...")
    high_visual_quality_dir = os.path.join(parent_dir, "high_visual_quality")
    filter_visual_quality_multi_gpus(av_synced_dir, high_visual_quality_dir, per_gpu_num_workers)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Total CPU worker processes for the multiprocessing (non-GPU) stages.
    parser.add_argument("--total_num_workers", type=int, default=100)
    # Worker processes spawned per GPU for the GPU-bound stages.
    parser.add_argument("--per_gpu_num_workers", type=int, default=20)
    parser.add_argument("--resolution", type=int, default=256)
    # Minimum SyncNet confidence for a video to be kept as AV-synced.
    parser.add_argument("--sync_conf_threshold", type=int, default=3)
    parser.add_argument("--temp_dir", type=str, default="temp")
    parser.add_argument("--input_dir", type=str, required=True)
    args = parser.parse_args()
    data_processing_pipeline(
        args.total_num_workers,
        args.per_gpu_num_workers,
        args.resolution,
        args.sync_conf_threshold,
        args.temp_dir,
        args.input_dir,
    )
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import subprocess
import tqdm
from multiprocessing import Pool
paths = []


def gather_paths(input_dir, output_dir):
    """Recursively collect [input_video, output_dir] pairs for unprocessed .mp4s."""
    for entry in sorted(os.listdir(input_dir)):
        src = os.path.join(input_dir, entry)
        if entry.endswith(".mp4"):
            # Skip only when a same-named file already exists in the output dir.
            if not os.path.isfile(os.path.join(output_dir, entry)):
                paths.append([src, output_dir])
        elif os.path.isdir(src):
            gather_paths(src, os.path.join(output_dir, entry))
def detect_shot(video_input, output_dir):
    """Split one video into shots with PySceneDetect's adaptive detector.

    Output files are named {basename}_shot_$SCENE_NUMBER (scenedetect expands
    the $SCENE_NUMBER template itself).
    """
    os.makedirs(output_dir, exist_ok=True)
    video = os.path.basename(video_input)[:-4]
    # Argument list with shell=False: robust to spaces and shell
    # metacharacters in paths (the original f-string shell command broke on
    # such paths and was injectable); no shell quoting of the filename
    # template is needed because no shell is involved.
    subprocess.run(
        [
            "scenedetect", "--quiet", "-i", video_input,
            "detect-adaptive", "--threshold", "2",
            "split-video", "--filename", f"{video}_shot_$SCENE_NUMBER",
            "--output", output_dir,
        ]
    )
def multi_run_wrapper(args):
    """Unpack a [video_input, output_dir] pair for Pool.imap_unordered."""
    video_input, output_dir = args
    return detect_shot(video_input, output_dir)
def detect_shot_multiprocessing(input_dir, output_dir, num_workers):
    """Detect and split shots for every video under input_dir with a process pool."""
    print(f"Recursively gathering video paths of {input_dir} ...")
    gather_paths(input_dir, output_dir)
    print(f"Detecting shot of {input_dir} ...")
    with Pool(num_workers) as pool:
        # Drain the iterator just to drive the progress bar.
        list(tqdm.tqdm(pool.imap_unordered(multi_run_wrapper, paths), total=len(paths)))
if __name__ == "__main__":
    # Standalone run on the ads dataset.
    input_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/ads/high-resolution"
    output_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/ads/shot"
    num_workers = 50
    detect_shot_multiprocessing(input_dir, output_dir, num_workers)
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import mediapipe as mp
from latentsync.utils.util import read_video
import os
import tqdm
import shutil
from multiprocessing import Pool
paths = []


def gather_video_paths(input_dir, output_dir, resolution):
    """Recursively collect [input, output, resolution] triples for unprocessed .mp4s."""
    for entry in sorted(os.listdir(input_dir)):
        src = os.path.join(input_dir, entry)
        dst = os.path.join(output_dir, entry)
        if entry.endswith(".mp4"):
            if not os.path.isfile(dst):  # skip already-filtered videos
                paths.append([src, dst, resolution])
        elif os.path.isdir(src):
            gather_video_paths(src, dst, resolution)
class FaceDetector:
    """Checks a video contains exactly one face at least `resolution` pixels big."""

    def __init__(self, resolution=256):
        self.face_detection = mp.solutions.face_detection.FaceDetection(
            model_selection=0, min_detection_confidence=0.5
        )
        self.resolution = resolution

    def detect_face(self, image):
        """Return True iff `image` has exactly one sufficiently large face.

        Raises when no face is detected at all (caller treats it as discard).
        """
        height, width = image.shape[:2]
        results = self.face_detection.process(image)
        if not results.detections:
            raise Exception("Face not detected")
        if len(results.detections) != 1:
            return False  # reject frames with multiple faces
        # Only the single detection is considered.
        box = results.detections[0].location_data.relative_bounding_box
        face_width = int(box.width * width)
        face_height = int(box.height * height)
        return face_width >= self.resolution and face_height >= self.resolution

    def detect_video(self, video_path):
        """Return True iff the video is non-empty and every frame passes detect_face."""
        frames = read_video(video_path, change_fps=False)
        if len(frames) == 0:
            return False
        # all() short-circuits on the first failing frame, like the original loop.
        return all(self.detect_face(frame) for frame in frames)

    def close(self):
        """Release mediapipe detector resources."""
        self.face_detection.close()
def filter_video(video_input, video_out, resolution):
    """Copy video_input to video_out iff every frame shows one face >= resolution px.

    Videos raising during detection (e.g. no face in some frame) are dropped
    silently, matching the original best-effort behavior.
    """
    if os.path.isfile(video_out):  # already filtered on a previous run
        return
    face_detector = FaceDetector(resolution)
    try:
        try:
            save = face_detector.detect_video(video_input)
        except Exception:
            # Face not detected somewhere in the video: discard it.
            return
        if save:
            os.makedirs(os.path.dirname(video_out), exist_ok=True)
            shutil.copy(video_input, video_out)
    finally:
        # Guarantee the mediapipe detector is released on every exit path;
        # the original duplicated close() calls and could leak on an
        # unexpected exception from makedirs/copy.
        face_detector.close()
def multi_run_wrapper(args):
    """Unpack a [video_input, video_out, resolution] triple for Pool.imap_unordered."""
    video_input, video_out, resolution = args
    return filter_video(video_input, video_out, resolution)
def filter_high_resolution_multiprocessing(input_dir, output_dir, resolution, num_workers):
    """Keep only videos whose faces are large enough, processed in parallel."""
    print(f"Recursively gathering video paths of {input_dir} ...")
    gather_video_paths(input_dir, output_dir, resolution)
    print(f"Filtering high resolution videos in {input_dir} ...")
    with Pool(num_workers) as pool:
        # Drain the iterator just to drive the progress bar.
        list(tqdm.tqdm(pool.imap_unordered(multi_run_wrapper, paths), total=len(paths)))
if __name__ == "__main__":
    # Standalone run on the HDTF training split.
    input_dir = "/mnt/bn/maliva-gen-ai/lichunyu/HDTF/original/train"
    output_dir = "/mnt/bn/maliva-gen-ai/lichunyu/HDTF/detected/train"
    resolution = 256
    num_workers = 50
    filter_high_resolution_multiprocessing(input_dir, output_dir, resolution, num_workers)
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tqdm
import torch
import torchvision
import shutil
from multiprocessing import Process
import numpy as np
from decord import VideoReader
from einops import rearrange
from eval.hyper_iqa import HyperNet, TargetNet
paths = []


def gather_paths(input_dir, output_dir):
    """Recursively collect (input, output) pairs for videos not yet quality-scored."""
    # os.makedirs(output_dir, exist_ok=True)
    for entry in tqdm.tqdm(sorted(os.listdir(input_dir))):
        src = os.path.join(input_dir, entry)
        dst = os.path.join(output_dir, entry)
        if entry.endswith(".mp4"):
            if not os.path.isfile(dst):  # skip already-scored videos
                paths.append((src, dst))
        elif os.path.isdir(src):
            gather_paths(src, dst)
def read_video(video_path: str):
    """Sample the first, middle, and last frame of a video.

    Returns a float tensor in [0, 1] with layout (3, c, h, w).
    """
    vr = VideoReader(video_path)
    sampled = np.stack(
        [vr[0].asnumpy(), vr[len(vr) // 2].asnumpy(), vr[-1].asnumpy()],
        axis=0,
    )
    vr.seek(0)
    frames = torch.from_numpy(rearrange(sampled, "b h w c -> b c h w"))
    return frames / 255.0
def func(paths, device_id):
    """Worker: score each video's visual quality on GPU `device_id`; keep those >= 40."""
    device = f"cuda:{device_id}"
    model_hyper = HyperNet(16, 112, 224, 112, 56, 28, 14, 7).to(device)
    model_hyper.train(False)
    # load the pre-trained model on the koniq-10k dataset
    model_hyper.load_state_dict(torch.load("checkpoints/auxiliary/koniq_pretrained.pkl"))
    transforms = torchvision.transforms.Compose(
        [
            torchvision.transforms.CenterCrop(size=224),
            torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ]
    )
    for video_input, video_output in paths:
        try:
            video_frames = read_video(video_input)
            video_frames = transforms(video_frames)
            video_frames = video_frames.clone().detach().to(device)
            # 'paras' contains the network weights conveyed to target network
            paras = model_hyper(video_frames)
            # Build the target network on the SAME device as model_hyper.
            # The original used .cuda(), which always selects GPU 0 and
            # breaks workers assigned to any other device.
            model_target = TargetNet(paras).to(device)
            for param in model_target.parameters():
                param.requires_grad = False
            # Quality prediction; paras['target_in_vec'] is the target net's input.
            pred = model_target(paras["target_in_vec"])
            # Quality score ranges 0-100; higher indicates better quality.
            quality_score = pred.mean().item()
            print(f"Input video: {video_input}\nVisual quality score: {quality_score:.2f}")
            if quality_score >= 40:
                os.makedirs(os.path.dirname(video_output), exist_ok=True)
                shutil.copy(video_input, video_output)
        except Exception as e:
            print(e)
def split(a, n):
    """Yield `n` contiguous chunks of `a`; chunk sizes differ by at most one."""
    size, remainder = divmod(len(a), n)
    start = 0
    for i in range(n):
        end = start + size + (1 if i < remainder else 0)
        yield a[start:end]
        start = end
def filter_visual_quality_multi_gpus(input_dir, output_dir, num_workers):
    """Spawn `num_workers` quality-scoring processes on each available GPU."""
    gather_paths(input_dir, output_dir)
    num_devices = torch.cuda.device_count()
    if num_devices == 0:
        raise RuntimeError("No GPUs found")
    split_paths = list(split(paths, num_workers * num_devices))
    workers = []
    for device_idx in range(num_devices):
        for worker_idx in range(num_workers):
            chunk = split_paths[device_idx * num_workers + worker_idx]
            worker = Process(target=func, args=(chunk, device_idx))
            worker.start()
            workers.append(worker)
    for worker in workers:
        worker.join()
if __name__ == "__main__":
    # Standalone run on the VoxCeleb2 AV-synced split.
    input_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/av_synced_high"
    output_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality"
    num_workers = 20  # How many processes per device
    filter_visual_quality_multi_gpus(input_dir, output_dir, num_workers)
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from multiprocessing import Pool
import tqdm
from latentsync.utils.av_reader import AVReader
from latentsync.utils.util import gather_video_paths_recursively
def remove_broken_video(video_path):
    """Delete `video_path` if AVReader cannot open/decode it."""
    try:
        AVReader(video_path)
    except Exception:
        # Any failure to construct the reader marks the file as broken.
        os.remove(video_path)
def remove_broken_videos_multiprocessing(input_dir, num_workers):
    """Scan input_dir recursively and delete undecodable videos in parallel."""
    video_paths = gather_video_paths_recursively(input_dir)
    print("Removing broken videos...")
    with Pool(num_workers) as pool:
        # Drain the iterator just to drive the progress bar.
        list(tqdm.tqdm(pool.imap_unordered(remove_broken_video, video_paths), total=len(video_paths)))
if __name__ == "__main__":
    # Standalone run on the multilingual affine-transformed set.
    input_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/multilingual/affine_transformed"
    num_workers = 50
    remove_broken_videos_multiprocessing(input_dir, num_workers)
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import mediapipe as mp
from latentsync.utils.util import read_video, gather_video_paths_recursively
import os
import tqdm
from multiprocessing import Pool
class FaceDetector:
    """Validates that every frame of a video contains exactly one detectable face."""

    def __init__(self):
        self.face_detection = mp.solutions.face_detection.FaceDetection(
            model_selection=0, min_detection_confidence=0.5
        )

    def detect_face(self, image):
        """Return True iff mediapipe finds exactly one face in `image`."""
        results = self.face_detection.process(image)
        detections = results.detections
        # Empty/None detections and multi-face frames both fail.
        return bool(detections) and len(detections) == 1

    def detect_video(self, video_path):
        """Return True iff the video decodes, is non-empty, and each frame has one face."""
        try:
            frames = read_video(video_path, change_fps=False)
        except Exception as e:
            print(f"Exception: {e} - {video_path}")
            return False
        if len(frames) == 0:
            return False
        # all() short-circuits on the first failing frame, like the original loop.
        return all(self.detect_face(frame) for frame in frames)

    def close(self):
        """Release mediapipe detector resources."""
        self.face_detection.close()
def remove_incorrect_affined(video_path):
    """Delete an affine-transformed video whose frames no longer show a single face."""
    if not os.path.isfile(video_path):
        return
    detector = FaceDetector()
    if not detector.detect_video(video_path):
        os.remove(video_path)
        print(f"Removed: {video_path}")
    detector.close()
def remove_incorrect_affined_multiprocessing(input_dir, num_workers):
    """Delete badly affine-transformed videos under input_dir, in parallel."""
    video_paths = gather_video_paths_recursively(input_dir)
    print(f"Total videos: {len(video_paths)}")
    print(f"Removing incorrect affined videos in {input_dir} ...")
    with Pool(num_workers) as pool:
        # Drain the iterator just to drive the progress bar.
        list(tqdm.tqdm(pool.imap_unordered(remove_incorrect_affined, video_paths), total=len(video_paths)))
if __name__ == "__main__":
    # Standalone run on the multilingual high-visual-quality set.
    input_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/multilingual_dcc/high_visual_quality"
    num_workers = 50
    remove_incorrect_affined_multiprocessing(input_dir, num_workers)
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import subprocess
import tqdm
from multiprocessing import Pool
import cv2
paths = []


def gather_paths(input_dir, output_dir):
    """Recursively collect [input, output] pairs for .mp4s not yet resampled."""
    for entry in sorted(os.listdir(input_dir)):
        src = os.path.join(input_dir, entry)
        dst = os.path.join(output_dir, entry)
        if entry.endswith(".mp4"):
            if not os.path.isfile(dst):  # skip already-resampled videos
                paths.append([src, dst])
        elif os.path.isdir(src):
            gather_paths(src, dst)
def get_video_fps(video_path: str):
    """Return the FPS reported by the video container's metadata."""
    cam = cv2.VideoCapture(video_path)
    try:
        return cam.get(cv2.CAP_PROP_FPS)
    finally:
        # Release the capture handle; the original never did, leaking a file
        # descriptor per call — fatal when scanning thousands of videos.
        cam.release()
def resample_fps_hz(video_input, video_output):
    """Resample a video to 25 fps with 16 kHz audio.

    When the source is already 25 fps, the video stream is stream-copied and
    only the audio is resampled.
    """
    os.makedirs(os.path.dirname(video_output), exist_ok=True)
    # Argument lists with shell=False: paths containing spaces or shell
    # metacharacters can neither break the command nor inject into a shell.
    if get_video_fps(video_input) == 25:
        command = ["ffmpeg", "-loglevel", "error", "-y", "-i", video_input,
                   "-c:v", "copy", "-ar", "16000", "-q:a", "0", video_output]
    else:
        command = ["ffmpeg", "-loglevel", "error", "-y", "-i", video_input,
                   "-r", "25", "-ar", "16000", "-q:a", "0", video_output]
    subprocess.run(command)
def multi_run_wrapper(args):
    """Unpack a [video_input, video_output] pair for Pool.imap_unordered."""
    video_input, video_output = args
    return resample_fps_hz(video_input, video_output)
def resample_fps_hz_multiprocessing(input_dir, output_dir, num_workers):
    """Resample all videos under input_dir to 25 fps / 16 kHz in parallel."""
    print(f"Recursively gathering video paths of {input_dir} ...")
    gather_paths(input_dir, output_dir)
    print(f"Resampling FPS and Hz of {input_dir} ...")
    with Pool(num_workers) as pool:
        # Drain the iterator just to drive the progress bar.
        list(tqdm.tqdm(pool.imap_unordered(multi_run_wrapper, paths), total=len(paths)))
if __name__ == "__main__":
    # Standalone run on the HDTF segmented training split.
    input_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/HDTF/segmented/train"
    output_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/HDTF/resampled_test"
    num_workers = 20
    resample_fps_hz_multiprocessing(input_dir, output_dir, num_workers)
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import subprocess
import tqdm
from multiprocessing import Pool
paths = []


def gather_paths(input_dir, output_dir):
    """Recursively collect [input, output_template] pairs for unsegmented videos.

    The output template contains ffmpeg's %03d placeholder. Segment numbering
    starts at 000, so we test for the FIRST segment file to decide whether a
    video was already processed — the original tested the template path
    itself, which never exists, so finished videos were always re-queued.
    """
    for video in sorted(os.listdir(input_dir)):
        if video.endswith(".mp4"):
            video_basename = video[:-4]
            video_input = os.path.join(input_dir, video)
            video_output = os.path.join(output_dir, f"{video_basename}_%03d.mp4")
            first_segment = os.path.join(output_dir, f"{video_basename}_000.mp4")
            if os.path.isfile(first_segment):
                continue
            paths.append([video_input, video_output])
        elif os.path.isdir(os.path.join(input_dir, video)):
            gather_paths(os.path.join(input_dir, video), os.path.join(output_dir, video))
def segment_video(video_input, video_output):
    """Split a video into ~5 s segments without re-encoding the video stream.

    `video_output` is an ffmpeg output template containing %03d, which ffmpeg
    itself expands per segment.
    """
    os.makedirs(os.path.dirname(video_output), exist_ok=True)
    # Argument list with shell=False: robust to spaces and shell
    # metacharacters in paths (the original f-string shell command broke on
    # such paths and was injectable).
    subprocess.run(
        ["ffmpeg", "-loglevel", "error", "-y", "-i", video_input,
         "-map", "0", "-c:v", "copy", "-segment_time", "5", "-f", "segment",
         "-reset_timestamps", "1", "-q:a", "0", video_output]
    )
def multi_run_wrapper(args):
    """Unpack a [video_input, video_output] pair for Pool.imap_unordered."""
    video_input, video_output = args
    return segment_video(video_input, video_output)
def segment_videos_multiprocessing(input_dir, output_dir, num_workers):
    """Segment all videos under input_dir into ~5 s clips, in parallel."""
    print(f"Recursively gathering video paths of {input_dir} ...")
    gather_paths(input_dir, output_dir)
    print(f"Segmenting videos of {input_dir} ...")
    with Pool(num_workers) as pool:
        # Drain the iterator just to drive the progress bar.
        list(tqdm.tqdm(pool.imap_unordered(multi_run_wrapper, paths), total=len(paths)))
if __name__ == "__main__":
    # Standalone run on the avatars_new cut set.
    input_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/avatars_new/cut"
    output_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/avatars_new/segmented"
    num_workers = 50
    segment_videos_multiprocessing(input_dir, output_dir, num_workers)
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tqdm
from eval.syncnet import SyncNetEval
from eval.syncnet_detect import SyncNetDetector
from eval.eval_sync_conf import syncnet_eval
import torch
import subprocess
import shutil
from multiprocessing import Process
paths = []


def gather_paths(input_dir, output_dir):
    """Recursively collect (input, output) pairs for videos not yet AV-synced."""
    # os.makedirs(output_dir, exist_ok=True)
    for entry in tqdm.tqdm(sorted(os.listdir(input_dir))):
        src = os.path.join(input_dir, entry)
        dst = os.path.join(output_dir, entry)
        if entry.endswith(".mp4"):
            if not os.path.isfile(dst):  # skip already-synced videos
                paths.append((src, dst))
        elif os.path.isdir(src):
            gather_paths(src, dst)
def adjust_offset(video_input: str, video_output: str, av_offset: int, fps: int = 25):
    """Shift the audio track by `av_offset` frames (at `fps`) relative to the video.

    Reads the same input twice, delays the second (audio) input by
    av_offset / fps seconds, and stream-copies both into the output.
    """
    # Argument list with shell=False: paths containing spaces or shell
    # metacharacters can neither break the command nor inject into a shell.
    subprocess.run(
        ["ffmpeg", "-loglevel", "error", "-y",
         "-i", video_input,
         "-itsoffset", str(av_offset / fps),
         "-i", video_input,
         "-map", "0:v", "-map", "1:a",
         "-c", "copy", "-q:v", "0", "-q:a", "0",
         video_output]
    )
def func(sync_conf_threshold, paths, device_id, process_temp_dir):
    """Worker: evaluate AV sync per video; copy or offset-fix those that pass."""
    os.makedirs(process_temp_dir, exist_ok=True)
    device = f"cuda:{device_id}"

    syncnet = SyncNetEval(device=device)
    syncnet.loadParameters("checkpoints/auxiliary/syncnet_v2.model")

    detect_results_dir = os.path.join(process_temp_dir, "detect_results")
    syncnet_eval_results_dir = os.path.join(process_temp_dir, "syncnet_eval_results")
    syncnet_detector = SyncNetDetector(device=device, detect_results_dir=detect_results_dir)

    for video_input, video_output in paths:
        try:
            av_offset, conf = syncnet_eval(
                syncnet, syncnet_detector, video_input, syncnet_eval_results_dir, detect_results_dir
            )
            # Keep only confidently-synced videos with a small (<= 6 frame) offset.
            if conf < sync_conf_threshold or abs(av_offset) > 6:
                continue
            os.makedirs(os.path.dirname(video_output), exist_ok=True)
            if av_offset == 0:
                shutil.copy(video_input, video_output)
            else:
                adjust_offset(video_input, video_output, av_offset)
        except Exception as e:
            print(e)
def split(a, n):
    """Yield `n` contiguous chunks of `a` whose lengths differ by at most one."""
    base, extra = divmod(len(a), n)
    for i in range(n):
        start = i * base + min(i, extra)
        # The first `extra` chunks carry one additional element.
        yield a[start : start + base + (1 if i < extra else 0)]
def sync_av_multi_gpus(input_dir, output_dir, temp_dir, num_workers, sync_conf_threshold):
    """Spawn `num_workers` AV-sync-checking processes per available GPU."""
    gather_paths(input_dir, output_dir)
    num_devices = torch.cuda.device_count()
    if num_devices == 0:
        raise RuntimeError("No GPUs found")
    split_paths = list(split(paths, num_workers * num_devices))
    workers = []
    for device_idx in range(num_devices):
        for worker_idx in range(num_workers):
            process_index = device_idx * num_workers + worker_idx
            worker = Process(
                target=func,
                args=(
                    sync_conf_threshold,
                    split_paths[process_index],
                    device_idx,
                    # One temp dir per process so workers never collide.
                    os.path.join(temp_dir, f"process_{process_index}"),
                ),
            )
            worker.start()
            workers.append(worker)
    for worker in workers:
        worker.join()
if __name__ == "__main__":
    # Standalone run on the ads affine-transformed set.
    input_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/ads/affine_transformed"
    output_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/temp"
    temp_dir = "temp"
    num_workers = 20  # How many processes per device
    sync_conf_threshold = 3
    sync_av_multi_gpus(input_dir, output_dir, temp_dir, num_workers, sync_conf_threshold)
torch==2.2.2
torchvision==0.17.2
--extra-index-url https://download.pytorch.org/whl/cu121
xformers==0.0.26
triton==2.2.0
diffusers==0.11.1
transformers==4.38.0
huggingface-hub==0.25.2
imageio==2.27.0
decord==0.6.0
accelerate==0.26.1
einops==0.7.0
omegaconf==2.3.0
safetensors==0.4.2
opencv-python==4.9.0.80
mediapipe==0.10.11
av==11.0.0
torch-fidelity==0.3.0
torchmetrics==1.3.1
python_speech_features==0.6
librosa==0.10.1
scenedetect==0.6.1
ffmpeg-python==0.2.0
lpips==0.1.4
face-alignment==1.4.1
ninja==1.11.1.1
pandas==2.0.3
numpy==1.24.4
pydub
moviepy==1.0.3
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment