1. prepare_align.py
import argparse
import yaml
from preprocessor import ljspeech, aishell3, libritts
# config為配置檔案中的内容,dataset為一個配置項,用以識别需要訓練的資料集
def main(config):
    """Run prepare_align for every corpus named in config["dataset"].

    config is the parsed preprocess.yaml; "dataset" selects which
    preprocessor module(s) handle the raw corpus.
    """
    preprocessors = (
        ("LJSpeech", ljspeech),
        ("AISHELL3", aishell3),
        ("LibriTTS", libritts),
    )
    for dataset_name, module in preprocessors:
        if dataset_name in config["dataset"]:
            module.prepare_align(config)
if __name__ == "__main__":
    # One positional argument: the path to preprocess.yaml.
    parser = argparse.ArgumentParser()
    parser.add_argument("config", type=str, help="path to preprocess.yaml")
    args = parser.parse_args()

    # FullLoader resolves standard YAML tags but refuses to construct
    # arbitrary Python objects, so loading the config file is safe.
    # Fix: the original leaked the file handle (open() with no close);
    # a with-block guarantees it is closed.
    with open(args.config, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    main(config)
2. preprocess.yaml
# preprocess.yaml — configuration consumed via config["path"][...] and
# config["preprocessing"][...] in prepare_align.
# Fix: the pasted file had all indentation stripped, which flattens the
# mapping (e.g. corpus_path was no longer nested under path); nesting restored.
dataset: "LibriTTS"

path:
  corpus_path: "/home/ming/Data/LibriTTS/train-clean-360"
  lexicon_path: "lexicon/librispeech-lexicon.txt"
  raw_path: "./raw_data/LibriTTS"
  preprocessed_path: "./preprocessed_data/LibriTTS"

preprocessing:
  val_size: 512
  text:
    text_cleaners: ["english_cleaners"]
    language: "en"
  audio:
    sampling_rate: 22050
    max_wav_value: 32768.0
  stft:
    filter_length: 1024
    hop_length: 256
    win_length: 1024
  mel:
    n_mel_channels: 80
    mel_fmin: 0
    mel_fmax: 8000 # please set to 8000 for HiFi-GAN vocoder, set to null for MelGAN vocoder
  pitch:
    feature: "phoneme_level" # support 'phoneme_level' or 'frame_level'
    normalization: True
  energy:
    feature: "phoneme_level" # support 'phoneme_level' or 'frame_level'
    normalization: True
3. LibriTTS.py
import os
import librosa
import numpy as np
from scipy.io import wavfile
from tqdm import tqdm
from text import _clean_text
def prepare_align(config):
    """Convert a LibriTTS split into the flat raw_data layout used for alignment.

    For every ``<base>.wav`` under ``corpus_path/<speaker>/<chapter>/`` this
    writes a peak-normalized 16-bit wav and a ``<base>.lab`` file holding the
    cleaned transcript (read from the matching ``<base>.normalized.txt``)
    into ``raw_path/<speaker>/``.
    """
    in_dir = config["path"]["corpus_path"]
    out_dir = config["path"]["raw_path"]
    sampling_rate = config["preprocessing"]["audio"]["sampling_rate"]
    max_wav_value = config["preprocessing"]["audio"]["max_wav_value"]
    cleaners = config["preprocessing"]["text"]["text_cleaners"]

    # Corpus layout is in_dir/<speaker>/<chapter>/<files>.
    for speaker in tqdm(os.listdir(in_dir)):
        speaker_out = os.path.join(out_dir, speaker)
        # Hoisted out of the per-file loop: one mkdir per speaker is enough.
        os.makedirs(speaker_out, exist_ok=True)
        for chapter in os.listdir(os.path.join(in_dir, speaker)):
            chapter_dir = os.path.join(in_dir, speaker, chapter)
            for file_name in os.listdir(chapter_dir):
                # Each utterance ships as <base>.wav, <base>.normalized.txt
                # and <base>.original.txt; only the wavs drive the loop.
                if not file_name.endswith(".wav"):
                    continue
                base_name = file_name[:-4]
                text_path = os.path.join(
                    chapter_dir, "{}.normalized.txt".format(base_name)
                )
                wav_path = os.path.join(chapter_dir, "{}.wav".format(base_name))

                # The .normalized.txt holds a single transcript line,
                # e.g. "Tom, the Piper's Son".
                with open(text_path) as f:
                    text = f.readline().strip("\n")
                # Normalize case, expand numbers/abbreviations, collapse
                # whitespace (per the configured cleaners).
                text = _clean_text(text, cleaners)

                # Fix: sr is keyword-only in librosa >= 0.10; the old
                # positional call librosa.load(wav_path, sampling_rate)
                # raises a TypeError there.
                wav, _ = librosa.load(wav_path, sr=sampling_rate)
                # Peak-normalize to max_wav_value; guard against an
                # all-silence file (peak 0 would divide by zero).
                peak = np.max(np.abs(wav))
                if peak > 0:
                    wav = wav / peak * max_wav_value
                # Fix: the peak sample reaches +max_wav_value (32768), one
                # past the int16 maximum — clip before the cast to avoid
                # overflow/wraparound.
                wav = np.clip(wav, -32768, 32767)
                wavfile.write(
                    os.path.join(speaker_out, "{}.wav".format(base_name)),
                    sampling_rate,
                    wav.astype(np.int16),
                )

                # The cleaned transcript becomes the .lab file the aligner reads.
                with open(
                    os.path.join(speaker_out, "{}.lab".format(base_name)),
                    "w",
                ) as f1:
                    f1.write(text)
3.1 function _clean_text() — dispatches text through the configured cleaners
def _clean_text(text, cleaner_names):
    """Apply each named cleaner from the ``cleaners`` module to *text*, in order.

    Args:
        text: raw transcript string.
        cleaner_names: iterable of attribute names on ``cleaners``
            (e.g. ["english_cleaners"]).

    Raises:
        Exception: if a name does not resolve to a cleaner function.
    """
    for name in cleaner_names:
        # Fix: supply a None default — without it getattr raises
        # AttributeError first, making the "Unknown cleaner" branch
        # unreachable.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception("Unknown cleaner: %s" % name)
        # e.g. cleaner == english_cleaners
        text = cleaner(text)
    return text
3.2 function english_cleaners(text) 文本處理
def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.'''
    # Every stage takes and returns a plain string, so the pipeline is a
    # simple left-to-right fold.
    pipeline = (
        convert_to_ascii,
        lowercase,
        expand_numbers,        # digits -> spelled-out words
        expand_abbreviations,  # dictionary-driven abbreviation expansion
        collapse_whitespace,   # tabs/newlines/space runs -> single spaces
    )
    for stage in pipeline:
        text = stage(text)
    return text
3.3 function expand_numbers(text) 文本中的數字處理
""" from https://github.com/keithito/tacotron """
import inflect
import re
# Shared inflect engine used by the _expand_* helpers below.
_inflect = inflect.engine()
# Integers with thousands separators, e.g. "1,234" (needs at least 3 chars).
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
# Decimal numbers such as "3.14".
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
# Pound amounts like "£3,333"; group 1 captures only the digits/commas.
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
# Dollar amounts like "$3.33"; group 1 captures digits, dots and commas.
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
# English ordinals: "1st", "2nd", "3rd", "17th".
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
# Any bare run of digits.
_number_re = re.compile(r"[0-9]+")
def _remove_commas(m):
return m.group(1).replace(",", "")
def _expand_decimal_point(m):
return m.group(1).replace(".", " point ")
def _expand_dollars(m):
match = m.group(1)
parts = match.split(".")
if len(parts) > 2:
return match + " dollars" # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = "dollar" if dollars == 1 else "dollars"
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = "dollar" if dollars == 1 else "dollars"
return "%s %s" % (dollars, dollar_unit)
elif cents:
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s" % (cents, cent_unit)
else:
return "zero dollars"
def _expand_ordinal(m):
    # inflect accepts ordinal strings directly: "17th" -> "seventeenth".
    ordinal = m.group(0)
    return _inflect.number_to_words(ordinal)
def _expand_number(m):
    """Spell out an integer, reading 1001-2999 in year style ("nineteen oh four")."""
    num = int(m.group(0))
    # Outside the year range, fall back to plain number-to-words.
    if not 1000 < num < 3000:
        return _inflect.number_to_words(num, andword="")
    if num == 2000:
        return "two thousand"
    if 2000 < num < 2010:
        return "two thousand " + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        return _inflect.number_to_words(num // 100) + " hundred"
    # Read in two-digit groups ("19 04"), saying "oh" for a zero tens group.
    words = _inflect.number_to_words(num, andword="", zero="oh", group=2)
    return words.replace(", ", " ")
def normalize_numbers(text):
    """Rewrite every numeric token in *text* as English words."""
    # Order matters: commas are stripped first so "1,234" reads as one
    # number, and currency is handled before the generic decimal and
    # plain-digit rules.
    rules = (
        (_comma_number_re, _remove_commas),          # "3,330" -> "3330"
        (_pounds_re, r"\1 pounds"),                  # "£3333" -> "3333 pounds"
        (_dollars_re, _expand_dollars),              # "$3.33" -> words
        (_decimal_number_re, _expand_decimal_point), # "333.3" -> "333 point 3"
        (_ordinal_re, _expand_ordinal),              # "17th" -> "seventeenth"
        (_number_re, _expand_number),                # digits -> words
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text