fix: transcrição mais legível e sem tags

This commit is contained in:
Ricardo Carneiro 2025-04-26 11:52:13 -03:00
parent 242b4596ff
commit 8d8ac63a80
11 changed files with 717 additions and 23 deletions

View File

@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.11.35327.3
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "YTExtractor", "YTExtractor\YTExtractor.csproj", "{7DA7D783-153F-42EF-87E4-239DEC80F91A}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "YTExtractor", "YTExtractor\YTExtractor.csproj", "{7DA7D783-153F-42EF-87E4-239DEC80F91A}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution

400
YTExtractor/.gitignore vendored Normal file
View File

@ -0,0 +1,400 @@
# ---> VisualStudio
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Mono auto generated files
mono_crash.*
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# ASP.NET Scaffolding
ScaffoldingReadMe.txt
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.log
*.tlog
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio 6 auto-generated project file (contains which files were open etc.)
*.vbp
# Visual Studio 6 workspace and project file (working project files containing files to include in project)
*.dsw
*.dsp
# Visual Studio 6 technical files
*.ncb
*.aps
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# CodeRush personal settings
.cr/personal
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
# Visual Studio History (VSHistory) files
.vshistory/
# BeatPulse healthcheck temp database
healthchecksdb
# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/
# Ionide (cross platform F# VS Code tools) working folder
.ionide/
# Fody - auto-generated XML schema
FodyWeavers.xsd
# VS Code files for those working on multiple tools
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace
# Local History for Visual Studio Code
.history/
# Windows Installer files from build outputs
*.cab
*.msi
*.msix
*.msm
*.msp
# JetBrains Rider
*.sln.iml

View File

@ -9,7 +9,7 @@ namespace YTExtractor.Data
public MongoDBConnector(IConfiguration configuration)
{
var connectionString = configuration.GetSection("MongoDbConnaction").Value;
var connectionString = configuration.GetSection("MongoDbConnection").Value;
var client = new MongoClient(connectionString);
_database = client.GetDatabase("YTExtractor");
_collection = _database.GetCollection<VideoData>("videos");

View File

@ -0,0 +1,30 @@
using Microsoft.Extensions.Configuration;
using Serilog.Events;
using Serilog;
using Serilog.Extensions.Hosting;
namespace YTExtractor.Logging.Configuration
{
    /// <summary>
    /// Serilog bootstrap helpers for the web host.
    /// </summary>
    public static class SerilogConfiguration
    {
        /// <summary>
        /// Applies the application's Serilog setup to <paramref name="config"/>: settings read
        /// from <paramref name="configuration"/> and <paramref name="services"/>, the log-context
        /// and environment-name enrichers, the fixed "Application" and "Workspace" properties and,
        /// when a server URL is configured, a Seq sink.
        /// </summary>
        /// <param name="builder">The web application builder being extended; not read by this method.</param>
        /// <param name="config">The logger configuration to populate.</param>
        /// <param name="services">Service provider handed to <c>ReadFrom.Services</c>.</param>
        /// <param name="configuration">Application configuration containing the "Serilog" section.</param>
        /// <returns>The same <paramref name="config"/> instance, to allow chaining.</returns>
        public static LoggerConfiguration SetLoggerConfiguration(this WebApplicationBuilder builder, LoggerConfiguration config, IServiceProvider services, IConfiguration configuration)
        {
            var workspace = configuration["Serilog:Properties:Workspace"];

            // The URL is looked up by position (third "WriteTo" entry), so it is absent
            // whenever an environment's settings do not define that entry.
            var seqServer = configuration.GetValue<string>("Serilog:WriteTo:2:Args:serverUrl");

            config
                .ReadFrom.Configuration(configuration)
                .ReadFrom.Services(services)
                .Enrich.FromLogContext()
                .Enrich.WithEnvironmentName()
                // The machine-name enricher is intentionally left disabled here.
                .Enrich.WithProperty("Application", "SumaTube")
                .Enrich.WithProperty("Workspace", workspace);

            // Only add the Seq sink when a URL is actually configured; passing a missing
            // URL to the sink would abort application start-up.
            // NOTE(review): ReadFrom.Configuration may already register a Seq sink from the
            // same "Serilog:WriteTo" entry - confirm a second sink is intended.
            if (!string.IsNullOrWhiteSpace(seqServer))
            {
                config.WriteTo.Seq(seqServer);
            }

            return config;
        }
    }
}

View File

@ -0,0 +1,28 @@
using Microsoft.Extensions.Logging;
namespace SumaTube.Crosscutting.Logging.Extensions
{
    /// <summary>
    /// Convenience extensions over <see cref="ILogger{T}"/> for recurring log statements:
    /// method entry/exit, exceptions and timing measurements.
    /// </summary>
    public static class LoggerExtensions
    {
        /// <summary>
        /// Writes an Information entry stating that <paramref name="methodName"/> was entered,
        /// attaching the supplied <paramref name="parameters"/> as the "Parameters" property.
        /// </summary>
        public static void LogMethodEntry<T>(this ILogger<T> logger, string methodName, params object[] parameters)
            => logger.LogInformation(
                "Entering method {MethodName} with parameters {@Parameters}",
                methodName,
                parameters);

        /// <summary>
        /// Writes an Information entry stating that <paramref name="methodName"/> finished,
        /// attaching <paramref name="result"/> (may be null) as the "Result" property.
        /// </summary>
        public static void LogMethodExit<T>(this ILogger<T> logger, string methodName, object result = null)
            => logger.LogInformation(
                "Exiting method {MethodName} with result {@Result}",
                methodName,
                result);

        /// <summary>
        /// Writes an Error entry for <paramref name="exception"/>. When <paramref name="message"/>
        /// is null a default template is used; in both cases the exception's message text is
        /// passed as the first template argument.
        /// </summary>
        public static void LogException<T>(this ILogger<T> logger, Exception exception, string message = null)
        {
            var template = message ?? "An error occurred: {ErrorMessage}";
            logger.LogError(exception, template, exception.Message);
        }

        /// <summary>
        /// Writes an Information entry recording how many milliseconds
        /// <paramref name="operation"/> took.
        /// </summary>
        public static void LogPerformance<T>(this ILogger<T> logger, string operation, long elapsedMilliseconds)
            => logger.LogInformation(
                "Performance: {Operation} took {ElapsedMilliseconds} ms",
                operation,
                elapsedMilliseconds);
    }
}

View File

@ -1,20 +1,18 @@
using YTExtractor;
using Serilog;
using Serilog.Sinks.InfluxDB;
using YTExtractor.Data;
using YTExtractor.Logging.Configuration;
using YTExtractor.Services;
// App configuration and endpoints
var builder = WebApplication.CreateBuilder(args);
var environment = builder.Environment.EnvironmentName;
Log.Logger = new LoggerConfiguration()
.WriteTo.InfluxDB(
address: "http://192.168.0.76",
dbName: "telegraf",
source: "YTExtractor-{environment}")
.Enrich.WithProperty("Environment", environment)
.CreateLogger();
builder.Host.UseSerilog((context, services, configuration) =>
{
builder.SetLoggerConfiguration(configuration, services, context.Configuration);
});
builder.Services.AddEndpointsApiExplorer();
builder.Services.AddSwaggerGen();
@ -38,7 +36,7 @@ app.MapPost("/api/video-info", async (VideoRequest request, MongoDBConnector mon
try
{
Log.Information($"Obtendo dados do video: {request.Url}");
var service = new ConvertTranscriptService();
var videoExists = await mongo.GetVideoByUrl(request.Url);
if (videoExists != null)
{
@ -46,15 +44,16 @@ app.MapPost("/api/video-info", async (VideoRequest request, MongoDBConnector mon
videoExists.Url,
videoExists.Titulo,
videoExists.ThumbnailUrl,
videoExists.TranscText
service.ExtractPlainText(videoExists.TranscText)
));
}
var info = await YoutubeService.GetVideoInfo(request.Url, tempDir);
var subtitles = await YoutubeService.GetSubtitles(request.Url, request.Language, tempDir);
var subtitles = service.ExtractPlainText(await YoutubeService.GetSubtitles(request.Url, request.Language, tempDir));
await mongo.InsertVideo(new VideoData
{
Id = Guid.NewGuid().ToString(),
Url = request.Url,
Titulo = info.Title,
ThumbnailUrl = info.ThumbnailUrl,

View File

@ -0,0 +1,82 @@
using System.Text.RegularExpressions;
using System.Text;
namespace YTExtractor.Services
{
    /// <summary>
    /// Converts WebVTT caption content into plain text or into SubRip (SRT) format.
    /// </summary>
    public class ConvertTranscriptService
    {
        // A cue timestamp in hh:mm:ss.mmm form.
        private const string Timestamp = @"\d{2}:\d{2}:\d{2}\.\d{3}";

        private const string PositioningSetting = "align:start position:0%";
        private const string MusicMarker = "[Música]";

        // The patterns are compiled once and shared by every call instead of being rebuilt per invocation.

        // "WEBVTT" header block (optionally preceded by a BOM) up to the first blank line.
        private static readonly Regex HeaderRegex =
            new Regex(@"^\uFEFF?WEBVTT.*?\n\n", RegexOptions.Singleline | RegexOptions.Compiled);

        // A whole cue timing line: "00:00:00.000 --> 00:00:01.870 <settings>".
        private static readonly Regex CueTimingLineRegex =
            new Regex(Timestamp + " --> " + Timestamp + @".*?\n", RegexOptions.Compiled);

        // Inline word timing such as "<00:00:00.280><c>".
        private static readonly Regex InlineTimestampRegex =
            new Regex("<" + Timestamp + ">(<c>)?", RegexOptions.Compiled);

        // Any remaining tag.
        private static readonly Regex TagRegex = new Regex(@"<[^>]+>", RegexOptions.Compiled);

        private static readonly Regex WhitespaceRegex = new Regex(@"\s+", RegexOptions.Compiled);

        // One cue: start time, end time and the text up to the next timing line (or the end of input).
        private static readonly Regex CueBlockRegex =
            new Regex(
                "(" + Timestamp + ") --> (" + Timestamp + @").*?\n(.*?)(?=\n" + Timestamp + "|$)",
                RegexOptions.Singleline | RegexOptions.Compiled);

        /// <summary>
        /// Strips the WebVTT header, cue timings, inline timing/style tags, the
        /// "align:start position:0%" setting and "[Música]" markers, returning the spoken
        /// text as a single line with single spaces.
        /// </summary>
        /// <param name="vttContent">Raw WebVTT content; LF, CRLF and CR line endings are accepted.</param>
        /// <returns>The cleaned text, or an empty string when the input is null or blank.</returns>
        public string ExtractPlainText(string vttContent)
        {
            if (string.IsNullOrWhiteSpace(vttContent))
            {
                return string.Empty;
            }

            // The header pattern relies on "\n\n"; without normalisation CRLF input kept its header.
            var text = NormalizeLineEndings(vttContent);

            text = HeaderRegex.Replace(text, "");
            text = CueTimingLineRegex.Replace(text, " ");
            text = InlineTimestampRegex.Replace(text, "");

            // Collapse whitespace before removing the positioning setting so the setting
            // is matched regardless of the spacing used in the source.
            text = WhitespaceRegex.Replace(text, " ");
            text = text.Replace(PositioningSetting, "");
            text = text.Replace(MusicMarker, "");
            text = TagRegex.Replace(text, "");

            return WhitespaceRegex.Replace(text, " ").Trim();
        }

        /// <summary>
        /// Converts WebVTT cues into SRT entries, numbering only the cues that still contain
        /// text after tags and markers have been removed.
        /// </summary>
        /// <param name="vttContent">Raw WebVTT content; LF, CRLF and CR line endings are accepted.</param>
        /// <returns>The SRT document, or an empty string when the input is null, blank or has no cues.</returns>
        public string ConvertToSrt(string vttContent)
        {
            if (string.IsNullOrWhiteSpace(vttContent))
            {
                return string.Empty;
            }

            var srtBuilder = new StringBuilder();
            var index = 1;

            foreach (Match match in CueBlockRegex.Matches(NormalizeLineEndings(vttContent)))
            {
                // Clean the cue text.
                var text = match.Groups[3].Value;
                text = InlineTimestampRegex.Replace(text, "");
                text = TagRegex.Replace(text, "");
                text = text.Replace(MusicMarker, "");
                text = text.Replace(PositioningSetting, "");
                text = WhitespaceRegex.Replace(text, " ").Trim();

                if (string.IsNullOrWhiteSpace(text))
                {
                    continue;
                }

                var startTime = ConvertVttTimeToSrtTime(match.Groups[1].Value);
                var endTime = ConvertVttTimeToSrtTime(match.Groups[2].Value);

                srtBuilder.AppendLine(index.ToString());
                srtBuilder.AppendLine($"{startTime} --> {endTime}");
                srtBuilder.AppendLine(text);
                srtBuilder.AppendLine();
                index++;
            }

            return srtBuilder.ToString();
        }

        /// <summary>
        /// Converts a VTT timestamp (00:00:00.000) to the SRT form (00:00:00,000).
        /// </summary>
        private string ConvertVttTimeToSrtTime(string vttTime)
        {
            return vttTime.Replace(".", ",");
        }

        /// <summary>
        /// Rewrites CRLF and lone CR line endings as LF so the patterns above can rely on "\n".
        /// </summary>
        private static string NormalizeLineEndings(string value)
        {
            return value.Replace("\r\n", "\n").Replace('\r', '\n');
        }
    }
}

View File

@ -7,11 +7,18 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Google.Apis.YouTube.v3" Version="1.69.0.3707" />
<PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="8.0.12" />
<PackageReference Include="MongoDB.Bson" Version="3.1.0" />
<PackageReference Include="MongoDB.Driver" Version="3.1.0" />
<PackageReference Include="Serilog" Version="4.2.1-dev-02337" />
<PackageReference Include="Serilog.Sinks.InfluxDB.DotNetCore" Version="1.0.2" />
<PackageReference Include="Serilog.AspNetCore" Version="9.0.0" />
<PackageReference Include="Serilog.Enrichers.Context" Version="4.6.5" />
<PackageReference Include="Serilog.Enrichers.Environment" Version="3.0.1" />
<PackageReference Include="Serilog.Enrichers.Thread" Version="4.0.0" />
<PackageReference Include="Serilog.Extensions.Hosting" Version="9.0.0" />
<PackageReference Include="Serilog.Settings.Configuration" Version="9.0.0" />
<PackageReference Include="Serilog.Sinks.Console" Version="6.0.0" />
<PackageReference Include="Serilog.Sinks.Seq" Version="9.0.0" />
<PackageReference Include="Swashbuckle.AspNetCore" Version="6.6.2" />
</ItemGroup>

View File

@ -0,0 +1,115 @@
using Google.Apis.Auth.OAuth2;
using Google.Apis.Services;
using Google.Apis.YouTube.v3;
using Google.Apis.YouTube.v3.Data;
using Microsoft.Extensions.Configuration;
using System.Diagnostics;
using System.Reflection;
using System.Runtime.InteropServices;
using System.Text.RegularExpressions;
namespace YouTubeAPIClient
{
    /// <summary>
    /// Reads video metadata and caption tracks through the YouTube Data API v3 client.
    /// </summary>
    public class YouTubeDataService
    {
        private const string ClientApplicationName = "YouTubeAPIClient";
        private const string ApiKeyVariable = "YOUTUBE_API_KEY";

        // Compiled once; the original rebuilt the video-id pattern on every call.
        private static readonly Regex UrlRegex =
            new Regex(@"^(https?\:\/\/)?(www\.)?(youtube\.com|youtu\.?be)\/.+$", RegexOptions.Compiled);

        private static readonly Regex VideoIdRegex =
            new Regex(@"(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^""&?\/\s]{11})", RegexOptions.Compiled);

        private readonly string _apiKey;
        private readonly YouTubeService _youtubeService;

        /// <summary>
        /// Creates a service instance holding a client configured with <paramref name="apiKey"/>.
        /// The static members of this class do not use this instance; they read the key from
        /// the YOUTUBE_API_KEY environment variable instead.
        /// </summary>
        public YouTubeDataService(string apiKey)
        {
            _apiKey = apiKey;
            _youtubeService = CreateClient(apiKey);
        }

        /// <summary>
        /// Returns true when <paramref name="urlx"/> looks like a youtube.com / youtu.be URL.
        /// Null or empty input yields false.
        /// </summary>
        public static bool IsValidYouTubeUrl(string urlx)
        {
            return !string.IsNullOrEmpty(urlx) && UrlRegex.IsMatch(urlx);
        }

        /// <summary>
        /// Fetches the title and the best available thumbnail URL (high, then medium, then default)
        /// of the video referenced by <paramref name="url"/>.
        /// </summary>
        /// <param name="url">YouTube video URL.</param>
        /// <param name="workingDir">Not used by this method; kept for signature compatibility.</param>
        /// <exception cref="ArgumentException">No video id could be extracted from <paramref name="url"/>.</exception>
        /// <exception cref="InvalidOperationException">The API key is not configured or the video was not found.</exception>
        public static async Task<YtDlpInfo> GetVideoInfo(string url, string workingDir)
        {
            string videoId = RequireVideoId(url);

            // The client is disposable; release it once the request has completed.
            using var youtubeService = CreateClientFromEnvironment();

            var videoRequest = youtubeService.Videos.List("snippet");
            videoRequest.Id = videoId;
            var response = await videoRequest.ExecuteAsync();

            // Guard against both a missing and an empty item list.
            var video = response.Items?.FirstOrDefault();
            if (video == null)
            {
                throw new InvalidOperationException("Vídeo não encontrado");
            }

            var thumbnails = video.Snippet.Thumbnails;
            var thumbnailUrl = thumbnails?.High?.Url ??
                               thumbnails?.Medium?.Url ??
                               thumbnails?.Default__?.Url ?? "";

            return new YtDlpInfo(video.Snippet.Title, thumbnailUrl);
        }

        /// <summary>
        /// Downloads the caption track for <paramref name="language"/> in WebVTT format, saves it
        /// as "{videoId}_{language}.vtt" inside <paramref name="workingDir"/> and returns its text.
        /// A track matches when its language equals <paramref name="language"/> (ignoring case) or,
        /// for automatic ("asr") tracks, starts with it.
        /// </summary>
        /// <exception cref="ArgumentException">The URL has no video id, or <paramref name="language"/> is blank.</exception>
        /// <exception cref="InvalidOperationException">The API key is not configured or no matching caption exists.</exception>
        public static async Task<string> GetSubtitles(string url, string language, string workingDir)
        {
            string videoId = RequireVideoId(url);
            if (string.IsNullOrWhiteSpace(language))
            {
                throw new ArgumentException("O idioma da legenda não foi informado", nameof(language));
            }

            using var youtubeService = CreateClientFromEnvironment();

            // NOTE(review): the YouTube Data API reference describes the captions endpoints as
            // requiring OAuth 2.0 authorization; an API key alone may be rejected - confirm.
            var captionRequest = youtubeService.Captions.List("snippet", videoId);
            var captionResponse = await captionRequest.ExecuteAsync();

            var caption = captionResponse.Items?.FirstOrDefault(c => MatchesLanguage(c, language));
            if (caption == null)
            {
                throw new InvalidOperationException($"Nenhuma legenda encontrada para o idioma {language}");
            }

            var captionDownloadRequest = youtubeService.Captions.Download(caption.Id);
            captionDownloadRequest.Tfmt = "vtt"; // The format is set as a plain string value.

            string vttFilePath = Path.Combine(workingDir, $"{videoId}_{language}.vtt");

            // Both streams are disposed even if the copy fails.
            await using (var captionStream = await captionDownloadRequest.ExecuteAsStreamAsync())
            await using (var fileStream = new FileStream(vttFilePath, FileMode.Create, FileAccess.Write))
            {
                await captionStream.CopyToAsync(fileStream);
            }

            return await File.ReadAllTextAsync(vttFilePath);
        }

        /// <summary>Builds an API client that authenticates with the given key.</summary>
        private static YouTubeService CreateClient(string apiKey)
        {
            return new YouTubeService(new BaseClientService.Initializer
            {
                ApiKey = apiKey,
                ApplicationName = ClientApplicationName
            });
        }

        /// <summary>Builds an API client from YOUTUBE_API_KEY, failing early with a clear message when it is unset.</summary>
        private static YouTubeService CreateClientFromEnvironment()
        {
            var apiKey = Environment.GetEnvironmentVariable(ApiKeyVariable);
            if (string.IsNullOrWhiteSpace(apiKey))
            {
                throw new InvalidOperationException($"A variável de ambiente {ApiKeyVariable} não está definida");
            }

            return CreateClient(apiKey);
        }

        /// <summary>Extracts the video id from the URL or throws when none can be found.</summary>
        private static string RequireVideoId(string url)
        {
            string videoId = ExtractVideoId(url);
            if (string.IsNullOrEmpty(videoId))
            {
                throw new ArgumentException("URL inválida ou não foi possível extrair o ID do vídeo");
            }

            return videoId;
        }

        /// <summary>Culture-independent, case-insensitive language match for a caption track.</summary>
        private static bool MatchesLanguage(Caption caption, string language)
        {
            var captionLanguage = caption.Snippet?.Language;
            if (captionLanguage == null)
            {
                return false;
            }

            return captionLanguage.Equals(language, StringComparison.OrdinalIgnoreCase) ||
                   (captionLanguage.StartsWith(language, StringComparison.OrdinalIgnoreCase) &&
                    string.Equals(caption.Snippet.TrackKind, "asr", StringComparison.OrdinalIgnoreCase));
        }

        /// <summary>Returns the 11-character video id found in the URL, or null when there is none.</summary>
        private static string ExtractVideoId(string url)
        {
            if (string.IsNullOrEmpty(url))
            {
                return null;
            }

            var match = VideoIdRegex.Match(url);
            return match.Success ? match.Groups[1].Value : null;
        }
    }

    /// <summary>
    /// Video title and thumbnail URL; declared with the same signature as the original type.
    /// </summary>
    public record YtDlpInfo(string Title, string ThumbnailUrl);
}

View File

@ -1,8 +1,27 @@
{
"Logging": {
"LogLevel": {
"Default": "Information",
"Microsoft.AspNetCore": "Warning"
"Serilog": {
"WriteTo": [
{ "Name": "Console" },
{
"Name": "File",
"Args": {
"path": "logs/dev-app-.log",
"rollingInterval": "Day"
}
},
{
"Name": "Seq",
"Args": {
"serverUrl": "http://192.168.0.76:5341",
"compact": true,
"batchPostingLimit": 100
}
}
],
"Properties": {
"Environment": "Development",
"Workspace": "Dev",
"Application": "YTExtractor"
}
}
}

View File

@ -1,10 +1,24 @@
{
"Logging": {
"LogLevel": {
"Serilog": {
"MinimumLevel": {
"Default": "Information",
"Microsoft.AspNetCore": "Warning"
"Override": {
"Microsoft": "Warning",
"System": "Warning"
}
},
"Enrich": [
"FromLogContext",
"WithMachineName",
"WithThreadId",
"WithEnvironmentUserName"
],
"Properties": {
"Workspace": "Dev",
"Application": "YTExtractor"
}
},
"AllowedHosts": "*",
"MongoDbConnaction": "mongodb://admin:c4rn31r0@192.168.0.82:27017,192.168.0.81:27017/?replicaSet=rs0"
"MongoDbConnection": "mongodb://localhost:27017"
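// Replica-set example (credentials redacted - supply real values through user secrets or environment variables): "MongoDbConnection": "mongodb://<user>:<password>@192.168.0.82:27017,192.168.0.81:27017/?replicaSet=rs0"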
}