Diffsptk
A differentiable version of SPTK
Install / Use
/learn @sp-nitech/DiffsptkREADME
diffsptk
diffsptk is a differentiable version of SPTK built on the PyTorch framework. It provides various speech signal processing modules as PyTorch layers, allowing users to integrate classic signal processing algorithms directly into neural network architectures and optimize them through backpropagation.
Requirements
- Python 3.10+
- PyTorch 2.3.1+
Documentation
- Reference Manual - Detailed API documentation and module specifications.
- Interactive Tutorial (Google Colab) - Hands-on examples to get started with
diffsptkin your browser. - Conference Paper - Technical background and implementation details available on the ISCA Archive.
Installation
The latest stable release can be installed via PyPI:
pip install diffsptk
Alternatively, the development version can be installed directly from the GitHub repository:
pip install git+https://github.com/sp-nitech/diffsptk.git@master
Examples
Running on a GPU
import diffsptk
stft_params = {"frame_length": 400, "frame_period": 80, "fft_length": 512}
# Read waveform.
x, sr = diffsptk.read("assets/data.wav", device="cuda")
# Compute spectrogram using a nn.Module class.
X1 = diffsptk.STFT(**stft_params, device="cuda")(x)
# Compute spectrogram using a functional method.
X2 = diffsptk.functional.stft(x, **stft_params)
print(X1.allclose(X2))
Mel-cepstral analysis and synthesis
import diffsptk
fl = 400 # Frame length.
fp = 80 # Frame period.
n_fft = 512 # FFT length.
M = 24 # Mel-cepstrum dimensions.
# Read waveform.
x, sr = diffsptk.read("assets/data.wav")
# Compute STFT amplitude of x.
stft = diffsptk.STFT(frame_length=fl, frame_period=fp, fft_length=n_fft)
X = stft(x)
# Estimate mel-cepstrum of x.
alpha = diffsptk.get_alpha(sr)
mcep = diffsptk.MelCepstralAnalysis(
fft_length=n_fft,
cep_order=M,
alpha=alpha,
n_iter=10,
)
mc = mcep(X)
# Reconstruct x.
mlsa = diffsptk.MLSA(filter_order=M, frame_period=fp, alpha=alpha, taylor_order=20)
x_hat = mlsa(mlsa(x, -mc), mc)
# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Extract pitch of x.
pitch = diffsptk.Pitch(
frame_period=fp,
sample_rate=sr,
f_min=80,
f_max=180,
voicing_threshold=0.4,
out_format="pitch",
)
p = pitch(x)
# Generate excitation signal.
excite = diffsptk.ExcitationGeneration(frame_period=fp)
e = excite(p)
n = diffsptk.nrand(x.size(0) - 1)
# Synthesize waveform.
x_voiced = mlsa(e, mc)
x_unvoiced = mlsa(n, mc)
# Output analysis-synthesis result.
diffsptk.write("voiced.wav", x_voiced, sr)
diffsptk.write("unvoiced.wav", x_unvoiced, sr)
WORLD analysis and synthesis
import diffsptk
fp = 80 # Frame period.
n_fft = 1024 # FFT length.
# Read waveform.
x, sr = diffsptk.read("assets/data.wav")
# Extract F0 of x, or prepare well-estimated F0.
pitch = diffsptk.Pitch(
frame_period=fp,
sample_rate=sr,
f_min=80,
f_max=180,
voicing_threshold=0.4,
out_format="f0",
)
f0 = pitch(x)
# Extract aperiodicity of x by D4C.
ap = diffsptk.Aperiodicity(
frame_period=fp,
sample_rate=sr,
fft_length=n_fft,
algorithm="d4c",
out_format="a",
)
A = ap(x, f0)
# Extract spectral envelope of x by CheapTrick.
pitch_spec = diffsptk.PitchAdaptiveSpectralAnalysis(
frame_period=fp,
sample_rate=sr,
fft_length=n_fft,
algorithm="cheap-trick",
out_format="power",
)
S = pitch_spec(x, f0)
# Reconstruct x.
world_synth = diffsptk.WorldSynthesis(
frame_period=fp,
sample_rate=sr,
fft_length=n_fft,
)
x_hat = world_synth(f0, A, S)
# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
LPC analysis and synthesis
import diffsptk
fl = 400 # Frame length.
fp = 80 # Frame period.
M = 24 # LPC dimensions.
# Read waveform.
x, sr = diffsptk.read("assets/data.wav")
# Estimate LPC of x.
frame = diffsptk.Frame(frame_length=fl, frame_period=fp)
window = diffsptk.Window(in_length=fl)
lpc = diffsptk.LPC(frame_length=fl, lpc_order=M, eps=1e-5)
a = lpc(window(frame(x)))
# Convert to inverse filter coefficients.
norm0 = diffsptk.AllPoleToAllZeroDigitalFilterCoefficients(filter_order=M)
b = norm0(a)
# Reconstruct x.
zerodf = diffsptk.AllZeroDigitalFilter(filter_order=M, frame_period=fp)
poledf = diffsptk.AllPoleDigitalFilter(filter_order=M, frame_period=fp)
x_hat = poledf(zerodf(x, b), a)
# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
Mel spectrogram analysis and synthesis
import diffsptk
fl = 400 # Frame length.
fp = 80 # Frame period.
n_fft = 512 # FFT length.
n_channel = 128 # Number of channels.
# Read waveform.
x, sr = diffsptk.read("assets/data.wav")
# Compute STFT amplitude of x.
stft = diffsptk.STFT(frame_length=fl, frame_period=fp, fft_length=n_fft)
X = stft(x)
# Extract log-mel spectrogram.
fbank = diffsptk.FBANK(
fft_length=n_fft,
n_channel=n_channel,
sample_rate=sr,
)
Y = fbank(X)
# Reconstruct linear spectrogram.
ifbank = diffsptk.IFBANK(
n_channel=n_channel,
fft_length=n_fft,
sample_rate=sr,
)
X_hat = ifbank(Y)
# Reconstruct x.
griffin = diffsptk.GriffinLim(
frame_length=fl,
frame_period=fp,
fft_length=n_fft,
)
x_hat = griffin(X_hat, out_length=x.size(0))
# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
Subband decomposition
import diffsptk
K = 4 # Number of subbands.
M = 40 # Order of filter.
# Read waveform.
x, sr = diffsptk.read("assets/data.wav")
# Decompose x.
pqmf = diffsptk.PQMF(K, M)
decimate = diffsptk.Decimation(K)
y = decimate(pqmf(x))
# Reconstruct x.
interpolate = diffsptk.Interpolation(K)
ipqmf = diffsptk.IPQMF(K, M)
x_hat = ipqmf(interpolate(K * y)).reshape(-1)
# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
Gammatone filter bank analysis and synthesis
import diffsptk
# Read waveform.
x, sr = diffsptk.read("assets/data.wav")
# Decompose x.
gammatone = diffsptk.GammatoneFilterBankAnalysis(sr)
y = gammatone(x)
# Reconstruct x.
igammatone = diffsptk.GammatoneFilterBankSynthesis(sr)
x_hat = igammatone(y).reshape(-1)
# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
Fractional octave band analysis and synthesis
import diffsptk
# Read waveform.
x, sr = diffsptk.read("assets/data.wav")
# Decompose x.
oband = diffsptk.FractionalOctaveBandAnalysis(sr)
y = oband(x)
# Reconstruct x.
x_hat = y.sum(1).reshape(-1)
# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
Constant-Q transform
import diffsptk
import librosa # This is to get sample audio.
fp = 128 # Frame period.
K = 252 # Number of CQ-bins.
B = 36 # Number of bins per octave.
# Read waveform.
x, sr = diffsptk.read(librosa.ex("trumpet"))
# Transform x.
cqt = diffsptk.CQT(fp, sr, n_bin=K, n_bin_per_octave=B)
c = cqt(x)
# Reconstruct x.
icqt = diffsptk.ICQT(fp, sr, n_bin=K, n_bin_per_octave=B)
x_hat = icqt(c, out_length=x.size(0))
# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
Modified discrete cosine transform
import diffsptk
fl = 512 # Frame length.
# Read waveform.
x, sr = diffsptk.read("assets/data.wav")
# Transform x.
mdct = diffsptk.MDCT(fl)
c = mdct(x)
# Reconstruct x.
imdct = diffsptk.IMDCT(fl)
x_hat = imdct(c, out_length=x.size(0))
# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
Vector quantization
import diffsptk
K = 2 # Codebook size.
M = 4 # Order of vector.
# Prepare input.
x = diffsptk.nrand(M)
# Quantize x.
vq = diffsptk.VectorQuantization(M, K)
x_hat, indices, commitment_loss = vq(x)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
License
This software is released under the Apache License 2.0.
Citation
@InProceedings{s
Related Skills
claude-opus-4-5-migration
84.4kMigrate prompts and code from Claude Sonnet 4.0, Sonnet 4.5, or Opus 4.1 to Opus 4.5
model-usage
341.0kUse CodexBar CLI local cost usage to summarize per-model usage for Codex or Claude, including the current (most recent) model or a full model breakdown. Trigger when asked for model-level usage/cost data from codexbar, or when you need a scriptable per-model summary from codexbar cost JSON.
TrendRadar
50.0k⭐AI-driven public opinion & trend monitor with multi-platform aggregation, RSS, and smart alerts.🎯 告别信息过载,你的 AI 舆情监控助手与热点筛选工具!聚合多平台热点 + RSS 订阅,支持关键词精准筛选。AI 智能筛选新闻 + AI 翻译 + AI 分析简报直推手机,也支持接入 MCP 架构,赋能 AI 自然语言对话分析、情感洞察与趋势预测等。支持 Docker ,数据本地/云端自持。集成微信/飞书/钉钉/Telegram/邮件/ntfy/bark/slack 等渠道智能推送。
mcp-for-beginners
15.7kThis open-source curriculum introduces the fundamentals of Model Context Protocol (MCP) through real-world, cross-language examples in .NET, Java, TypeScript, JavaScript, Rust and Python. Designed for developers, it focuses on practical techniques for building modular, scalable, and secure AI workflows from session setup to service orchestration.
