天天看點

FastSpeech2 代碼閱讀筆記——資料處理

作者:語音刺客

1. prepare_align.py

import argparse
 import yaml
 from preprocessor import ljspeech, aishell3, libritts
 
 # config為配置檔案中的内容,dataset為一個配置項,用以識别需要訓練的資料集
 def main(config):
     """Run alignment preparation for each corpus selected by the config.

     ``config`` is the parsed content of the preprocessing config file. Its
     ``dataset`` entry is tested for each supported corpus name; the three
     checks are independent, so more than one ``prepare_align`` may run.
     """
     selected = config["dataset"]

     if "LJSpeech" in selected:
         ljspeech.prepare_align(config)

     if "AISHELL3" in selected:
         aishell3.prepare_align(config)

     if "LibriTTS" in selected:
         libritts.prepare_align(config)
 
 
 if __name__ == "__main__":
     # The script takes one positional argument: the path to preprocess.yaml.
     parser = argparse.ArgumentParser()
     parser.add_argument("config", type=str, help="path to preprocess.yaml")
     args = parser.parse_args()

     # Parse the YAML file into a plain Python mapping. The context manager
     # guarantees the file handle is closed even if parsing fails.
     # NOTE(review): FullLoader understands the full YAML language while
     # refusing arbitrary function calls; if the config could ever come from
     # an untrusted source, prefer yaml.safe_load.
     with open(args.config, "r", encoding="utf-8") as config_file:
         config = yaml.load(config_file, Loader=yaml.FullLoader)
     main(config)
            

2. preprocess.yaml

# Corpus selector: main() in prepare_align.py checks this value for
# "LJSpeech", "AISHELL3" and "LibriTTS" to decide which prepare_align() runs.
dataset: "LibriTTS"
 
 path:
   # Input corpus root; prepare_align() walks <corpus_path>/<speaker>/<chapter>/.
   corpus_path: "/home/ming/Data/LibriTTS/train-clean-360"
   # NOTE(review): not read by the code shown in these notes; presumably the
   # pronunciation lexicon used by a later alignment step - confirm.
   lexicon_path: "lexicon/librispeech-lexicon.txt"
   # Output root; prepare_align() writes <raw_path>/<speaker>/<name>.wav and .lab.
   raw_path: "./raw_data/LibriTTS"
   # NOTE(review): not read by the code shown in these notes; presumably the
   # output directory of a later preprocessing step - confirm.
   preprocessed_path: "./preprocessed_data/LibriTTS"
 
 preprocessing:
   # NOTE(review): not read by the code shown in these notes - confirm meaning.
   val_size: 512
   text:
     # Cleaner function names applied in order by _clean_text().
     text_cleaners: ["english_cleaners"]
     language: "en"
   audio:
     # Rate passed to librosa.load() and used when writing the output wav.
     sampling_rate: 22050
     # Peak amplitude the waveform is scaled to before the int16 cast.
     max_wav_value: 32768.0
   # The stft / mel / pitch / energy sections below are not read by the code
   # shown in these notes.
   stft:
     filter_length: 1024
     hop_length: 256
     win_length: 1024
   mel:
     n_mel_channels: 80
     mel_fmin: 0
     mel_fmax: 8000 # please set to 8000 for HiFi-GAN vocoder, set to null for MelGAN vocoder
   pitch:
     feature: "phoneme_level" # support 'phoneme_level' or 'frame_level'
     normalization: True
   energy:
     feature: "phoneme_level" # support 'phoneme_level' or 'frame_level'
     normalization: True

3. libritts.py(即 prepare_align.py 中 `from preprocessor import libritts` 所導入的模組)

import os
 import librosa
 import numpy as np
 from scipy.io import wavfile
 from tqdm import tqdm
 from text import _clean_text
 
 
 def prepare_align(config):
     """Convert a LibriTTS-style corpus into per-speaker wav/lab pairs.

     Walks ``<corpus_path>/<speaker>/<chapter>/`` and, for every ``*.wav``
     file found there:

     * reads the first line of the sibling ``<name>.normalized.txt`` and runs
       it through the configured text cleaners;
     * loads the audio at the configured sampling rate, peak-normalizes it to
       ``max_wav_value`` and stores it as 16-bit PCM in
       ``<raw_path>/<speaker>/<name>.wav``;
     * writes the cleaned text to ``<raw_path>/<speaker>/<name>.lab``.

     Args:
         config: Parsed preprocessing config. Reads ``path.corpus_path``,
             ``path.raw_path``, ``preprocessing.audio.sampling_rate``,
             ``preprocessing.audio.max_wav_value`` and
             ``preprocessing.text.text_cleaners``.
     """
     in_dir = config["path"]["corpus_path"]
     out_dir = config["path"]["raw_path"]
     sampling_rate = config["preprocessing"]["audio"]["sampling_rate"]
     max_wav_value = config["preprocessing"]["audio"]["max_wav_value"]
     cleaners = config["preprocessing"]["text"]["text_cleaners"]
     int16_range = np.iinfo(np.int16)

     # Each entry of in_dir is treated as a speaker directory.
     for speaker in tqdm(os.listdir(in_dir)):
         speaker_out_dir = os.path.join(out_dir, speaker)
         for chapter in os.listdir(os.path.join(in_dir, speaker)):
             chapter_dir = os.path.join(in_dir, speaker, chapter)
             # A chapter directory holds three files per utterance, e.g.
             #   100_121669_000001_000000.normalized.txt
             #   100_121669_000001_000000.original.txt
             #   100_121669_000001_000000.wav
             # Only the .wav entries drive the loop.
             for file_name in os.listdir(chapter_dir):
                 if not file_name.endswith(".wav"):
                     continue
                 base_name = file_name[:-4]
                 text_path = os.path.join(
                     chapter_dir, "{}.normalized.txt".format(base_name)
                 )
                 wav_path = os.path.join(chapter_dir, file_name)

                 # The transcript is a single sentence on the first line,
                 # e.g. "Tom, the Piper's Son". Decode explicitly as UTF-8 so
                 # the result does not depend on the platform default.
                 with open(text_path, encoding="utf-8") as f:
                     text = f.readline().strip("\n")
                 text = _clean_text(text, cleaners)

                 os.makedirs(speaker_out_dir, exist_ok=True)

                 # `sr` must be passed by keyword: it is keyword-only in
                 # recent librosa releases. The returned rate is unused.
                 wav, _ = librosa.load(wav_path, sr=sampling_rate)

                 # Peak-normalize so the loudest sample has magnitude
                 # max_wav_value. An empty or all-zero clip is left untouched
                 # to avoid dividing by zero.
                 peak = np.max(np.abs(wav)) if wav.size else 0.0
                 if peak > 0:
                     wav = wav / peak * max_wav_value
                 # int16 spans -32768..32767, so a positive peak scaled to
                 # 32768.0 would overflow on the cast; clip first.
                 wav = np.clip(wav, int16_range.min, int16_range.max)

                 wavfile.write(
                     os.path.join(speaker_out_dir, "{}.wav".format(base_name)),
                     sampling_rate,
                     wav.astype(np.int16),
                 )
                 # The .lab file carries the cleaned transcript for the wav
                 # written above.
                 with open(
                     os.path.join(speaker_out_dir, "{}.lab".format(base_name)),
                     "w",
                     encoding="utf-8",
                 ) as f1:
                     f1.write(text)

3.1 function _clean_text() 調用文本處理

def _clean_text(text, cleaner_names):
    """Apply the named cleaner functions to *text*, in order.

    Each name in *cleaner_names* is looked up as an attribute of the
    ``cleaners`` object visible in this module's scope and called with the
    current text; the result feeds the next cleaner.

    Args:
        text: The string to clean.
        cleaner_names: Iterable of cleaner attribute names, e.g.
            ``["english_cleaners"]``.

    Returns:
        The text after every cleaner has been applied. Unchanged when
        *cleaner_names* is empty.

    Raises:
        Exception: If a name does not resolve to a cleaner.
    """
    for name in cleaner_names:
        # Supply a default so an unknown name reaches the explicit error
        # below; without it getattr() raises AttributeError first and the
        # "Unknown cleaner" check can never fire.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception("Unknown cleaner: %s" % name)
        text = cleaner(text)
    return text

3.2 function english_cleaners(text) 文本處理

def english_cleaners(text):
    '''Clean English text: ASCII conversion, lowercasing, number and abbreviation expansion, whitespace collapsing.'''
    # The stages run strictly in this order; each receives the previous
    # stage's output.
    pipeline = (
        convert_to_ascii,
        lowercase,
        expand_numbers,        # number handling
        expand_abbreviations,  # expand abbreviations
        collapse_whitespace,   # collapse whitespace such as tabs and newlines
    )
    for stage in pipeline:
        text = stage(text)
    return text

3.3 function normalize_numbers(text) 文本中的數字處理(對應 3.2 中 expand_numbers 這一步的數字處理邏輯)

""" from https://github.com/keithito/tacotron """
 
 import inflect
 import re
 
 # Shared inflect engine; the helpers below call its number_to_words().
 _inflect = inflect.engine()
 # A run of three or more digits/commas that starts and ends with a digit,
 # e.g. "1,234"; the whole run is captured.
 _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
 # Digits, a dot, then digits, e.g. "3.14"; the whole number is captured.
 _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
 # A pound sign followed by digits/commas ending in a digit; captures the amount.
 _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
 # A dollar sign followed by digits/dots/commas ending in a digit; captures the amount.
 _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
 # Digits immediately followed by an ordinal suffix, e.g. "17th".
 _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
 # Any run of digits.
 _number_re = re.compile(r"[0-9]+")
 
 def _remove_commas(m):
     """Return the first captured group of match *m* with all commas removed."""
     digits_with_commas = m.group(1)
     return "".join(digits_with_commas.split(","))
 
 def _expand_decimal_point(m):
     """Return the first captured group of match *m* with each "." spoken as " point "."""
     number = m.group(1)
     return " point ".join(number.split("."))
 
 def _expand_dollars(m):
     """Turn a matched dollar amount into a "N dollars, M cents" phrase.

     The first captured group of *m* is split on "."; the part before the dot
     is the dollar count and the part after it is read as an integer cent
     count. Amounts with more than one dot are returned verbatim followed by
     " dollars", and an amount of zero becomes "zero dollars".
     """
     amount = m.group(1)
     pieces = amount.split(".")
     if len(pieces) > 2:
         # Unexpected format: leave the digits as they are.
         return amount + " dollars"

     whole = int(pieces[0]) if pieces[0] else 0
     fraction = int(pieces[1]) if len(pieces) > 1 and pieces[1] else 0
     if not whole and not fraction:
         return "zero dollars"

     # Build only the parts that are non-zero, using singular units for 1.
     spoken = []
     if whole:
         spoken.append("%s %s" % (whole, "dollar" if whole == 1 else "dollars"))
     if fraction:
         spoken.append("%s %s" % (fraction, "cent" if fraction == 1 else "cents"))
     return ", ".join(spoken)
 
 
 def _expand_ordinal(m):
     """Spell out the whole matched ordinal (e.g. "17th") via the inflect engine."""
     ordinal = m.group(0)
     return _inflect.number_to_words(ordinal)
 
 
 def _expand_number(m):
     """Spell out the matched integer, reading 1001-2999 in a year-like style.

     Values strictly between 1000 and 3000 get special handling: 2000 is
     "two thousand", 2001-2009 are "two thousand <n>", exact hundreds are
     "<n> hundred", and everything else is read in two-digit groups with
     "oh" for zero. All other values are spelled out without "and".
     """
     value = int(m.group(0))

     if not 1000 < value < 3000:
         return _inflect.number_to_words(value, andword="")
     if value == 2000:
         return "two thousand"
     if 2000 < value < 2010:
         return "two thousand " + _inflect.number_to_words(value % 100)
     if value % 100 == 0:
         return _inflect.number_to_words(value // 100) + " hundred"

     # Two-digit grouping yields comma-separated groups; join them with spaces.
     grouped = _inflect.number_to_words(value, andword="", zero="oh", group=2)
     return grouped.replace(", ", " ")
 
 
 def normalize_numbers(text):
     """Rewrite the numeric expressions in *text* by applying a fixed sequence of substitutions.

     The order matters: commas are stripped first so that the later patterns
     see plain digit runs, and the generic number rule runs last.
     """
     substitutions = (
         # Strip commas inside numbers: 3,333 -> 3333
         (_comma_number_re, _remove_commas),
         # £3,333 -> 3333 pounds
         (_pounds_re, r"\1 pounds"),
         # $333.3 -> 333 dollars, 3 cents
         (_dollars_re, _expand_dollars),
         # 333.3 -> 333 point 3
         (_decimal_number_re, _expand_decimal_point),
         # 17th -> seventeenth
         (_ordinal_re, _expand_ordinal),
         # Any remaining digit run -> words
         (_number_re, _expand_number),
     )
     for pattern, replacement in substitutions:
         text = pattern.sub(replacement, text)
     return text