83 lines
3.2 KiB
C#
83 lines
3.2 KiB
C#
using System.Text.RegularExpressions;
|
|
using System.Text;
|
|
|
|
namespace YTExtractor.Services
|
|
{
|
|
/// <summary>
/// Converts WebVTT caption text into either a single plain-text string or
/// SubRip (SRT) subtitles.
/// </summary>
public class ConvertTranscriptService
{
    // WebVTT timestamp. The hours component is optional in WebVTT ("mm:ss.ttt"),
    // so it is matched as an optional prefix.
    private const string TimestampPattern = @"(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}";

    // "WEBVTT" header block up to the first blank line. Accepts an optional
    // leading byte-order mark and both LF and CRLF line endings.
    private static readonly Regex HeaderRegex = new Regex(
        @"^\uFEFF?WEBVTT.*?\r?\n\r?\n",
        RegexOptions.Singleline | RegexOptions.Compiled);

    // A full cue timing line ("start --> end [settings]"), including its line
    // break, or the end of input when the file stops right after the timing line.
    private static readonly Regex TimingLineRegex = new Regex(
        TimestampPattern + " --> " + TimestampPattern + @"[^\n]*(?:\n|$)",
        RegexOptions.Compiled);

    // One cue: group 1 = start time, group 2 = end time, group 3 = payload,
    // which runs until the next line that starts with a timestamp or the end of input.
    private static readonly Regex CueRegex = new Regex(
        "(" + TimestampPattern + ") --> (" + TimestampPattern + @").*?\n(.*?)(?=\n" + TimestampPattern + "|$)",
        RegexOptions.Singleline | RegexOptions.Compiled);

    // Inline word-timing marks such as <00:00:00.280><c>.
    private static readonly Regex InlineTimestampRegex = new Regex(
        "<" + TimestampPattern + ">(<c>)?",
        RegexOptions.Compiled);

    // Any remaining markup tag.
    private static readonly Regex TagRegex = new Regex(@"<[^>]+>", RegexOptions.Compiled);

    // "[Música]" annotations.
    private static readonly Regex MusicTagRegex = new Regex(@"\[Música\]", RegexOptions.Compiled);

    // Cue positioning settings that leaked into the text.
    private static readonly Regex CueSettingsRegex = new Regex(@"align:start position:0%", RegexOptions.Compiled);

    // Any run of whitespace, including line breaks.
    private static readonly Regex WhitespaceRegex = new Regex(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Strips the WebVTT header, cue timing lines, markup tags and "[Música]"
    /// annotations, and returns the remaining caption text on a single line.
    /// </summary>
    /// <param name="vttContent">Raw WebVTT content.</param>
    /// <returns>The caption text with all whitespace collapsed to single spaces and trimmed.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="vttContent"/> is null.</exception>
    public string ExtractPlainText(string vttContent)
    {
        if (vttContent == null)
        {
            throw new System.ArgumentNullException(nameof(vttContent));
        }

        // Drop the WEBVTT header block.
        string text = HeaderRegex.Replace(vttContent, "");

        // Replace each timing line with a space so neighbouring cues do not fuse together.
        text = TimingLineRegex.Replace(text, " ");

        return CleanCueText(text);
    }

    /// <summary>
    /// Converts WebVTT cues into SRT entries. Cues whose text is empty after
    /// cleanup are skipped, and the remaining entries are numbered from 1.
    /// </summary>
    /// <param name="vttContent">Raw WebVTT content.</param>
    /// <returns>The SRT document, or an empty string when no cue with text was found.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="vttContent"/> is null.</exception>
    public string ConvertToSrt(string vttContent)
    {
        if (vttContent == null)
        {
            throw new System.ArgumentNullException(nameof(vttContent));
        }

        var srtBuilder = new StringBuilder();
        int index = 1;

        foreach (Match match in CueRegex.Matches(vttContent))
        {
            string text = CleanCueText(match.Groups[3].Value);
            if (string.IsNullOrWhiteSpace(text))
            {
                // Nothing left to display for this cue; do not consume an index for it.
                continue;
            }

            string startTime = ConvertVttTimeToSrtTime(match.Groups[1].Value);
            string endTime = ConvertVttTimeToSrtTime(match.Groups[2].Value);

            srtBuilder.AppendLine(index.ToString(System.Globalization.CultureInfo.InvariantCulture));
            srtBuilder.AppendLine($"{startTime} --> {endTime}");
            srtBuilder.AppendLine(text);
            srtBuilder.AppendLine();
            index++;
        }

        return srtBuilder.ToString();
    }

    /// <summary>
    /// Removes inline timing marks, markup tags, "[Música]" annotations and
    /// leaked cue settings, then collapses whitespace and trims the result.
    /// </summary>
    private static string CleanCueText(string text)
    {
        text = InlineTimestampRegex.Replace(text, "");
        text = TagRegex.Replace(text, "");
        text = MusicTagRegex.Replace(text, "");
        text = CueSettingsRegex.Replace(text, "");
        return WhitespaceRegex.Replace(text, " ").Trim();
    }

    /// <summary>
    /// Converts a WebVTT timestamp (00:00:00.000 or 00:00.000) to the SRT form
    /// (00:00:00,000), adding a zero hours component when the input has none.
    /// </summary>
    private static string ConvertVttTimeToSrtTime(string vttTime)
    {
        string srtTime = vttTime.Replace(".", ",");

        // A single ':' means the hours were omitted; SRT always carries them.
        bool hasHours = srtTime.IndexOf(':') != srtTime.LastIndexOf(':');
        return hasHours ? srtTime : "00:" + srtTime;
    }
}
|
|
}
|