// 319 lines · 13 KiB · C#
using System.IO;
|
|
using System.Text;
|
|
using Microsoft.AspNetCore.Mvc;
|
|
using Microsoft.Extensions.Localization;
|
|
using iText.Kernel.Pdf;
|
|
using iText.Kernel.Pdf.Canvas.Parser;
|
|
using iText.Kernel.Pdf.Canvas.Parser.Listener;
|
|
using iText.Kernel.Exceptions;
|
|
|
|
namespace Convert_It_Online.Areas.DocumentConverters.Controllers
|
|
{
|
|
[Area("DocumentConverters")]
|
|
public class PdfToTextController : Controller
|
|
{
|
|
/// <summary>Localizer used to resolve every user-facing string by resource key.</summary>
private readonly IStringLocalizer<SharedResource> _localizer;

/// <summary>Logger for this controller; messages are prefixed with "[PDF-TEXT]".</summary>
private readonly ILogger<PdfToTextController> _logger;

/// <summary>Largest upload, in bytes, accepted when a preview is requested (10 MiB).</summary>
private const long MaxPreviewSize = 10 * 1024 * 1024;

/// <summary>
/// Creates the controller with its localizer and logger dependencies.
/// </summary>
/// <param name="localizer">Source of localized strings.</param>
/// <param name="logger">Logger for diagnostics.</param>
public PdfToTextController(IStringLocalizer<SharedResource> localizer, ILogger<PdfToTextController> logger)
    => (_localizer, _logger) = (localizer, logger);
|
|
|
|
/// <summary>
/// Publishes the localized strings for the home link, menu entries and footer
/// to the view. Each entry is stored under the same name as its resource key.
/// </summary>
private void SetCommonViewBagProperties()
{
    // ViewBag is a dynamic wrapper over ViewData, so writing ViewData[key]
    // is the same as assigning ViewBag.<key>.
    string[] sharedKeys =
    {
        "HomeLink",
        "TextMenuTitle",
        "ImageMenuTitle",
        "DocumentMenuTitle",
        "CaseConverterTitle",
        "JpgToWebpTitle",
        "HeicToJpgTitle",
        "PdfToTextTitle",
        "PdfBarcodeTitle",
        "FooterText",
        "About",
        "Contact",
        "Terms",
    };

    foreach (var key in sharedKeys)
    {
        ViewData[key] = _localizer[key];
    }
}
|
|
|
|
/// <summary>
/// Populates everything the Index view reads: the shared layout strings,
/// the page-specific labels and buttons, the FAQ entries, and the meta description
/// (which mirrors the page description).
/// </summary>
private void PrepareIndexView()
{
    SetCommonViewBagProperties();

    // (view key, resource key) pairs, in the order they are published.
    // ViewBag is a dynamic wrapper over ViewData, so ViewData[key] == ViewBag.<key>.
    (string ViewKey, string ResourceKey)[] entries =
    {
        ("PageTitle", "PdfTextConverterPageTitle"),
        ("PageDescription", "PdfTextConverterPageDescription"),
        ("PdfPlainTextTabTitle", "PdfPlainTextTabTitle"),
        ("PdfMarkdownTabTitle", "PdfMarkdownTabTitle"),
        ("PdfFileInputLabel", "PdfFileInputLabel"),
        ("PdfPasswordLabel", "PdfPasswordLabel"),
        ("PdfPasswordPlaceholder", "PdfPasswordPlaceholder"),
        ("PdfPasswordHint", "PdfPasswordHint"),
        ("ExtractPlainTextButton", "ExtractPlainTextButton"),
        ("ExtractMarkdownButton", "ExtractMarkdownButton"),
        ("DownloadPlainTextButton", "DownloadPlainTextButton"),
        ("DownloadMarkdownButton", "DownloadMarkdownButton"),
        ("PdfTextPreviewTitle", "PdfTextPreviewTitle"),
        ("SelectFileError", "SelectFileError"),

        ("FaqWhatTitle", "PdfTextFaqWhatTitle"),
        ("FaqWhatContent", "PdfTextFaqWhatContent"),
        ("FaqHowTitle", "PdfTextFaqHowTitle"),
        ("FaqHowContent", "PdfTextFaqHowContent"),
        ("FaqWhyTitle", "PdfTextFaqWhyTitle"),
        ("FaqWhyContent", "PdfTextFaqWhyContent"),
        ("FaqSecurityTitle", "PdfTextFaqSecurityTitle"),
        ("FaqSecurityContent", "PdfTextFaqSecurityContent"),
        ("FaqLimitsTitle", "PdfTextFaqLimitsTitle"),
        ("FaqLimitsContent", "PdfTextFaqLimitsContent"),
    };

    foreach (var (viewKey, resourceKey) in entries)
    {
        ViewData[viewKey] = _localizer[resourceKey];
    }

    // The meta description reuses the already-localized page description.
    ViewData["MetaDescription"] = ViewData["PageDescription"];
}
|
|
|
|
/// <summary>
/// Renders the converter page after loading all localized view data.
/// </summary>
/// <returns>The default view for this action.</returns>
public IActionResult Index()
{
    PrepareIndexView();

    ViewResult page = View();
    return page;
}
|
|
|
|
/// <summary>
/// Diagnostic endpoint: returns a fixed JSON payload with the current server time,
/// which confirms that routing reaches this controller.
/// </summary>
/// <returns>JSON with <c>success</c>, <c>message</c> and <c>timestamp</c>.</returns>
[HttpGet]
public IActionResult Test() =>
    Json(new { success = true, message = "Roteamento funcionando!", timestamp = DateTime.Now });
|
|
|
|
/// <summary>
/// Extracts the text of an uploaded PDF as plain text.
/// </summary>
/// <param name="pdfFile">The uploaded PDF.</param>
/// <param name="password">Optional password for encrypted documents.</param>
/// <param name="preview">When true the result is returned as JSON instead of a file download.</param>
/// <returns>The result produced by <c>HandleExtraction</c> in plain-text mode.</returns>
[HttpPost]
public async Task<IActionResult> ExtractPlainText(IFormFile pdfFile, string? password, bool preview = false)
    => await HandleExtraction(pdfFile, password, preview, toMarkdown: false);
|
|
|
|
/// <summary>
/// Extracts the text of an uploaded PDF and returns it as a Markdown (.md) document.
/// </summary>
/// <param name="pdfFile">The uploaded PDF.</param>
/// <param name="password">Optional password for encrypted documents.</param>
/// <param name="preview">When true the result is returned as JSON instead of a file download.</param>
/// <returns>The result produced by <c>HandleExtraction</c> in Markdown mode.</returns>
[HttpPost]
public async Task<IActionResult> ExtractMarkdown(IFormFile pdfFile, string? password, bool preview = false)
    => await HandleExtraction(pdfFile, password, preview, toMarkdown: true);
|
|
|
|
/// <summary>
/// Shared implementation behind the plain-text and Markdown extraction actions.
/// Validates the upload, extracts its text and returns it either as JSON
/// (preview mode) or as a downloadable file.
/// </summary>
/// <param name="pdfFile">The uploaded file; may be null when nothing was posted.</param>
/// <param name="password">Optional password for encrypted documents.</param>
/// <param name="preview">
/// True to answer with JSON (<c>success</c>, <c>content</c>, <c>filename</c>, <c>format</c>);
/// false to answer with a file download, or the Index view on failure.
/// </param>
/// <param name="toMarkdown">True to produce a <c>.md</c> result, false for <c>.txt</c>.</param>
/// <returns>A JSON result, a file result, or the Index view carrying a model error.</returns>
private async Task<IActionResult> HandleExtraction(IFormFile? pdfFile, string? password, bool preview, bool toMarkdown)
{
    if (pdfFile == null || pdfFile.Length == 0)
    {
        _logger.LogWarning("[PDF-TEXT] Attempt without file");
        return ExtractionFailure("SelectFileError", preview, setConversionError: false);
    }

    if (!IsValidPdf(pdfFile))
    {
        _logger.LogWarning("[PDF-TEXT] Invalid file type: {ContentType}", pdfFile.ContentType);
        return ExtractionFailure("InvalidPdfFileError", preview, setConversionError: false);
    }

    // The size cap only applies to previews; downloads are not limited here.
    if (preview && pdfFile.Length > MaxPreviewSize)
    {
        _logger.LogWarning("[PDF-TEXT] Preview rejected, file too large: {Length} bytes", pdfFile.Length);
        return Json(new { success = false, message = _localizer["PdfPreviewTooLarge"].Value });
    }

    var extraction = await TryExtractTextAsync(pdfFile, password);
    if (!extraction.Success)
    {
        return ExtractionFailure(extraction.ErrorKey, preview, setConversionError: true);
    }

    var textContent = extraction.Content ?? string.Empty;
    if (toMarkdown)
    {
        textContent = ToMarkdown(textContent);
    }

    var extension = toMarkdown ? ".md" : ".txt";
    var fileBaseName = Path.GetFileNameWithoutExtension(pdfFile.FileName);
    var downloadFileName = (string.IsNullOrWhiteSpace(fileBaseName) ? "resultado" : fileBaseName) + extension;

    if (preview)
    {
        return Json(new
        {
            success = true,
            content = textContent,
            filename = downloadFileName,
            format = toMarkdown ? "markdown" : "text"
        });
    }

    // The payload is UTF-8 encoded, so declare the charset; otherwise clients may
    // decode non-ASCII characters with a different default encoding.
    var contentType = toMarkdown ? "text/markdown; charset=utf-8" : "text/plain; charset=utf-8";
    var payload = Encoding.UTF8.GetBytes(textContent);
    return File(payload, contentType, downloadFileName);
}

/// <summary>
/// Builds the failure response for an extraction request: JSON in preview mode,
/// otherwise the Index view with a model error attached to the "pdfFile" field.
/// </summary>
/// <param name="errorKey">Resource key of the localized error message.</param>
/// <param name="preview">True to answer with JSON instead of re-rendering the view.</param>
/// <param name="setConversionError">True to also expose the message as <c>ViewBag.ConversionError</c>.</param>
private IActionResult ExtractionFailure(string errorKey, bool preview, bool setConversionError)
{
    var message = _localizer[errorKey].Value;
    if (preview)
    {
        return Json(new { success = false, message });
    }

    ModelState.AddModelError("pdfFile", message);
    if (setConversionError)
    {
        ViewBag.ConversionError = message;
    }

    PrepareIndexView();
    return View("Index");
}
|
|
|
|
/// <summary>
/// Buffers the uploaded PDF in memory and tries to extract its text. The document is
/// first opened without a password; if a password was supplied, it is then tried in
/// UTF-8, ASCII and Latin-1 encodings (identical byte sequences are tried only once).
/// </summary>
/// <param name="pdfFile">The uploaded PDF.</param>
/// <param name="password">Optional password for encrypted documents.</param>
/// <returns>
/// <c>Success</c> with the extracted <c>Content</c>, or a failure carrying the resource key
/// of the error: "PdfPasswordRequired", "PdfInvalidPassword" or "InvalidPdfFileError".
/// </returns>
private async Task<(bool Success, string? Content, string ErrorKey)> TryExtractTextAsync(IFormFile pdfFile, string? password)
{
    try
    {
        await using var upload = new MemoryStream();
        await pdfFile.CopyToAsync(upload);
        var pdfBytes = upload.ToArray();

        // Candidate passwords, starting with "no password" so unencrypted documents
        // open on the first attempt.
        var candidates = new List<byte[]?> { null };
        if (!string.IsNullOrEmpty(password))
        {
            foreach (var encoding in new[] { Encoding.UTF8, Encoding.ASCII, Encoding.Latin1 })
            {
                var encoded = encoding.GetBytes(password);

                // For ASCII-only passwords all three encodings give the same bytes;
                // skip duplicates so the document is not re-parsed for nothing.
                if (!candidates.Exists(known => known != null && known.AsSpan().SequenceEqual(encoded)))
                {
                    candidates.Add(encoded);
                }
            }
        }

        foreach (var candidate in candidates)
        {
            try
            {
                var readerProperties = new ReaderProperties();
                if (candidate != null)
                {
                    readerProperties.SetPassword(candidate);
                }

                using var pdfReader = new PdfReader(new MemoryStream(pdfBytes), readerProperties);
                using var pdfDocument = new PdfDocument(pdfReader);

                var text = ExtractDocumentText(pdfDocument);
                if (!string.IsNullOrWhiteSpace(text))
                {
                    return (true, text, string.Empty);
                }
            }
            catch (BadPasswordException) when (!string.IsNullOrEmpty(password))
            {
                // This candidate was rejected. Move on to the next one; this includes
                // the initial password-less attempt, which previously aborted the loop
                // before any supplied password was tried. When no password was supplied
                // the exception propagates to the outer handler instead.
                continue;
            }
        }

        // Either every candidate was rejected or no text was produced.
        if (!string.IsNullOrEmpty(password))
        {
            return (false, null, "PdfInvalidPassword");
        }

        return (false, null, "InvalidPdfFileError");
    }
    catch (BadPasswordException)
    {
        return (false, null, string.IsNullOrEmpty(password) ? "PdfPasswordRequired" : "PdfInvalidPassword");
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "[PDF-TEXT] Failed to extract text from {FileName}", pdfFile.FileName);
        return (false, null, "InvalidPdfFileError");
    }
}

/// <summary>
/// Extracts the text of every page of an open document. Each page is tried with a
/// location-based strategy first and a simple strategy second; a page that yields no
/// text is represented by a placeholder line.
/// </summary>
/// <param name="pdfDocument">An already opened document.</param>
/// <returns>The concatenated page texts, trimmed.</returns>
private string ExtractDocumentText(PdfDocument pdfDocument)
{
    var builder = new StringBuilder();
    int pageCount = pdfDocument.GetNumberOfPages();

    for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
    {
        var page = pdfDocument.GetPage(pageNumber);

        // New strategy instances are created for every page.
        var strategies = new ITextExtractionStrategy[]
        {
            new LocationTextExtractionStrategy(),
            new SimpleTextExtractionStrategy()
        };

        string pageText = "";
        foreach (var strategy in strategies)
        {
            try
            {
                pageText = PdfTextExtractor.GetTextFromPage(page, strategy);
                if (!string.IsNullOrWhiteSpace(pageText))
                {
                    break;
                }
            }
            catch (Exception ex)
            {
                // Best effort: fall through to the next strategy, but leave a trace.
                _logger.LogDebug(ex, "[PDF-TEXT] Strategy {Strategy} failed on page {Page}", strategy.GetType().Name, pageNumber);
            }
        }

        builder.AppendLine(string.IsNullOrWhiteSpace(pageText)
            ? $"[Página {pageNumber} - texto não detectado ou pode conter apenas imagens]"
            : pageText);
        builder.AppendLine();
    }

    return builder.ToString().Trim();
}
|
|
|
|
/// <summary>
/// Decides whether an upload should be treated as a PDF. It is accepted when its
/// declared content type is a PDF media type or when its file name ends in ".pdf"
/// (case-insensitive). The file contents are not inspected.
/// </summary>
/// <param name="file">The uploaded file.</param>
/// <returns>True when either check matches; false otherwise or when <paramref name="file"/> is null.</returns>
private static bool IsValidPdf(IFormFile file)
{
    if (file is null)
    {
        return false;
    }

    var mediaType = file.ContentType?.ToLowerInvariant();
    var extension = Path.GetExtension(file.FileName);

    return (mediaType is "application/pdf" or "application/x-pdf")
        || extension.Equals(".pdf", StringComparison.OrdinalIgnoreCase);
}
|
|
|
|
/// <summary>
/// Normalizes extracted text for use as a Markdown document: CRLF line endings are
/// unified, trailing whitespace is removed from every line, and the result is trimmed.
/// Lines are joined with <see cref="Environment.NewLine"/>.
/// </summary>
/// <param name="text">The extracted text.</param>
/// <returns>The cleaned text, or an empty string when the input contains only whitespace.</returns>
private static string ToMarkdown(string text)
{
    var unified = text.Replace("\r\n", "\n").Trim();
    if (unified.Length == 0)
    {
        return string.Empty;
    }

    var cleanedLines = Array.ConvertAll(unified.Split('\n'), row => row.TrimEnd());
    return string.Join(Environment.NewLine, cleanedLines).Trim();
}
|
|
}
|
|
}
|