@comment{
  Entry for the paper on super-resolution mel-spectrogram post-filtering for
  speech synthesis (Sheng, Huang, Pavlovskiy, December 2019).
  NOTE(review): the author field holds surnames only; add given names in
  "Last, First" form once confirmed against the publication.
  NOTE(review): the article entry type requires a journal field, which is
  missing here. If this is a preprint, add the venue or eprint identifiers
  (or switch to a more fitting entry type) -- TODO confirm.
  abstractNote is not a standard field and is ignored by standard styles;
  it is kept as-is for whatever tool exported it.
}
@article{sheng_huang_pavlovskiy_2019,
  author       = {Sheng and Huang and Pavlovskiy},
  title        = {High-quality Speech Synthesis Using Super-resolution Mel-Spectrogram},
  year         = {2019},
  month        = dec,
  abstractNote = {In speech synthesis and speech enhancement systems, melspectrograms need to
be precise in acoustic representations. However, the generated spectrograms are
over-smooth, that could not produce high quality synthesized speech. Inspired
by image-to-image translation, we address this problem by using a
learning-based post filter combining Pix2PixHD and ResUnet to reconstruct the
mel-spectrograms together with super-resolution. From the resulting
super-resolution spectrogram networks, we can generate enhanced spectrograms to
produce high quality synthesized speech. Our proposed model achieves improved
mean opinion scores (MOS) of 3.71 and 4.01 over baseline results of 3.29 and
3.84, while using vocoder Griffin-Lim and WaveNet, respectively.},
}