YTExtractor/YTExtractor/Services/ConvertTranscriptService.cs
2025-04-26 11:52:13 -03:00

83 lines
3.2 KiB
C#

using System.Text.RegularExpressions;
using System.Text;
namespace YTExtractor.Services
{
/// <summary>
/// Converts WebVTT subtitle content (such as auto-generated caption tracks)
/// into plain text or into SubRip (SRT) format.
/// </summary>
public class ConvertTranscriptService
{
    // A WebVTT timestamp: "mm:ss.ttt" or "hh:mm:ss.ttt" (the hours part is optional
    // and may have more than two digits).
    private const string TimestampPattern = @"(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}";

    // Cue setting that can leak into cue text and is removed during cleaning.
    private const string PositionSetting = "align:start position:0%";

    // Sound marker that is dropped from the output.
    private const string MusicMarker = "[Música]";

    // Header block: "WEBVTT" (optionally preceded by a BOM) up to the first blank line.
    private static readonly Regex HeaderRegex = new Regex(
        @"^\uFEFF?WEBVTT.*?\n\n",
        RegexOptions.Singleline | RegexOptions.Compiled);

    // A whole cue timing line, including any cue settings after the end time.
    // The trailing newline is optional so a timing line at the very end is removed too.
    private static readonly Regex TimingLineRegex = new Regex(
        TimestampPattern + " --> " + TimestampPattern + @"[^\n]*\n?",
        RegexOptions.Compiled);

    // One cue: start time, end time, then the payload up to the next timing line
    // or the end of the input.
    private static readonly Regex CueRegex = new Regex(
        "(" + TimestampPattern + ") --> (" + TimestampPattern + @")[^\n]*\n(.*?)(?=\n" + TimestampPattern + " -->|$)",
        RegexOptions.Singleline | RegexOptions.Compiled);

    // Inline word-timing marks such as <00:00:00.280><c>.
    private static readonly Regex InlineTimestampRegex = new Regex(
        "<" + TimestampPattern + ">(?:<c>)?",
        RegexOptions.Compiled);

    // Any remaining markup tag.
    private static readonly Regex TagRegex = new Regex(@"<[^>]+>", RegexOptions.Compiled);

    private static readonly Regex WhitespaceRegex = new Regex(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Strips the WebVTT header, cue timing lines and markup, returning the spoken
    /// text as a single line with whitespace collapsed.
    /// </summary>
    /// <param name="vttContent">Raw WebVTT content; LF, CRLF and CR line endings are accepted.</param>
    /// <returns>The cleaned text, or an empty string when nothing remains.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="vttContent"/> is null.</exception>
    public string ExtractPlainText(string vttContent)
    {
        if (vttContent == null)
        {
            throw new System.ArgumentNullException(nameof(vttContent));
        }

        // Normalize first: the header pattern relies on "\n\n" as the blank line.
        string content = NormalizeLineEndings(vttContent);
        content = HeaderRegex.Replace(content, "");
        // Replace timing lines with a space so neighbouring cues do not run together.
        content = TimingLineRegex.Replace(content, " ");
        return CleanCueText(content);
    }

    /// <summary>
    /// Converts WebVTT content into SRT: numbered cues with "hh:mm:ss,ttt" timestamps.
    /// Cues whose text is empty after cleaning are skipped and do not consume an index.
    /// </summary>
    /// <param name="vttContent">Raw WebVTT content; LF, CRLF and CR line endings are accepted.</param>
    /// <returns>The SRT document, or an empty string when no cue has text.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="vttContent"/> is null.</exception>
    public string ConvertToSrt(string vttContent)
    {
        if (vttContent == null)
        {
            throw new System.ArgumentNullException(nameof(vttContent));
        }

        string content = NormalizeLineEndings(vttContent);
        var srtBuilder = new StringBuilder();
        int index = 1;

        foreach (Match match in CueRegex.Matches(content))
        {
            string text = CleanCueText(match.Groups[3].Value);
            if (text.Length == 0)
            {
                continue;
            }

            string startTime = ConvertVttTimeToSrtTime(match.Groups[1].Value);
            string endTime = ConvertVttTimeToSrtTime(match.Groups[2].Value);

            srtBuilder.AppendLine(index.ToString(System.Globalization.CultureInfo.InvariantCulture));
            srtBuilder.AppendLine($"{startTime} --> {endTime}");
            srtBuilder.AppendLine(text);
            srtBuilder.AppendLine();
            index++;
        }

        return srtBuilder.ToString();
    }

    // Converts CRLF and lone CR line endings to LF so the patterns only deal with "\n".
    private static string NormalizeLineEndings(string content)
    {
        return content.Replace("\r\n", "\n").Replace('\r', '\n');
    }

    // Removes inline timestamps, markup tags, the music marker and the leaked position
    // setting, then collapses whitespace runs into single spaces and trims the result.
    private static string CleanCueText(string text)
    {
        text = InlineTimestampRegex.Replace(text, "");
        text = TagRegex.Replace(text, "");
        text = text.Replace(MusicMarker, "");
        text = text.Replace(PositionSetting, "");
        return WhitespaceRegex.Replace(text, " ").Trim();
    }

    // Converts a VTT time (00:00:00.000 or 00:00.000) to SRT time (00:00:00,000).
    private string ConvertVttTimeToSrtTime(string vttTime)
    {
        string srtTime = vttTime.Replace('.', ',');

        // A single colon means the optional hours part was omitted; SRT always has hours.
        int firstColon = srtTime.IndexOf(':');
        bool hoursOmitted = firstColon >= 0 && firstColon == srtTime.LastIndexOf(':');
        return hoursOmitted ? "00:" + srtTime : srtTime;
    }
}
}