Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance auto-rename functionality with additional options #1604

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;

import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;

Expand All @@ -33,19 +32,83 @@ public class AutoRenameController {
private static final Logger logger = LoggerFactory.getLogger(AutoRenameController.class);

private static final float TITLE_FONT_SIZE_THRESHOLD = 20.0f;
private static final int LINE_LIMIT = 11;
private static final int DEFAULT_LINE_LIMIT = 11;

/**
 * Handles the multipart POST to {@code /auto-rename}: loads the uploaded PDF, derives a new
 * file name for it and returns the document under that name with a {@code .pdf} suffix.
 *
 * <p>When a non-empty keyword is supplied, the name comes from {@code getTextByKeyword}; if
 * that yields {@code "Untitled"}, or no keyword is given, {@code extractHeaderUsingFontSize}
 * is used instead.
 *
 * <p>NOTE(review): this span looks like a rendered diff, so adjacent pairs such as the two
 * {@code summary}, {@code description} and {@code throws} lines are presumably the removed and
 * added versions of a single line - confirm against the real file.
 */
@PostMapping(consumes = "multipart/form-data", value = "/auto-rename")
@Operation(
summary = "Extract header from PDF file",
summary = "Extract header from PDF file or Auto rename ",
description =
"This endpoint accepts a PDF file and attempts to extract its title or header based on heuristics. Input:PDF Output:PDF Type:SISO")
"This endpoint accepts a PDF file and attempts to rename it based on various methods. Based on keyword or else extract its title or header based on heuristics. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> extractHeader(@ModelAttribute ExtractHeaderRequest request)
throws Exception {
throws IOException {
MultipartFile file = request.getFileInput();
Boolean useFirstTextAsFallback = request.isUseFirstTextAsFallback();

String keyword = request.getKeyword();
Boolean useAfter = request.getUseAfter();
// Fall back to DEFAULT_LINE_LIMIT when the request carries no linesToCheck value.
Integer linesToCheck =
request.getLinesToCheck() != null ? request.getLinesToCheck() : DEFAULT_LINE_LIMIT;

PDDocument document = Loader.loadPDF(file.getBytes());
// NOTE(review): 'check' is not read anywhere in the visible method body; the condition is
// repeated inline in the if-statement below.
boolean check = keyword != null && !keyword.isEmpty();

String newFileName;
if (keyword != null && !keyword.isEmpty()) {
// Keyword mode: try the keyword search first.
newFileName = getTextByKeyword(document, keyword, useAfter, linesToCheck);
// "Untitled" is the value getTextByKeyword returns when it finds nothing usable.
if ("Untitled".equals(newFileName)) {
newFileName =
extractHeaderUsingFontSize(document, useFirstTextAsFallback, linesToCheck);
}
} else {
// No keyword supplied: go straight to the font-size based header extraction.
newFileName =
extractHeaderUsingFontSize(document, useFirstTextAsFallback, linesToCheck);
}
// Strip characters not allowed in file names, then append the extension.
newFileName = sanitizeFileName(newFileName) + ".pdf";
return WebResponseUtils.pdfDocToWebResponse(document, newFileName);
}

/**
 * Searches the first lines of the document's first page for {@code keyword}
 * (case-insensitively) and derives a candidate file name from the first matching line.
 *
 * @param document the PDF to read; only page 1 is extracted
 * @param keyword the text to look for; a null or blank keyword never matches
 * @param useAfter when {@code TRUE}, return only the text following the keyword (or the next
 *     line if nothing follows it); {@code null} is treated as {@code false}, in which case the
 *     matching line is returned, joined with the next line unless that line also contains the
 *     keyword
 * @param linesToCheck maximum number of leading lines to inspect
 * @return the derived text, or {@code "Untitled"} when nothing usable is found
 * @throws IOException if text extraction fails
 */
private String getTextByKeyword(
        PDDocument document, String keyword, Boolean useAfter, int linesToCheck)
        throws IOException {
    // A blank keyword would match every line via contains(""), so reject it up front.
    if (keyword == null || keyword.trim().isEmpty()) {
        return "Untitled";
    }

    PDFTextStripper stripper = new PDFTextStripper();
    stripper.setStartPage(1);
    stripper.setEndPage(1);
    String text = stripper.getText(document);

    String[] lines = text.split("\n");
    // Locale.ROOT keeps case folding independent of the server's default locale.
    String needle = keyword.toLowerCase(java.util.Locale.ROOT).trim();
    // The request field is a nullable Boolean; unboxing it directly would throw an NPE.
    boolean takeTextAfter = Boolean.TRUE.equals(useAfter);

    int limit = Math.min(linesToCheck, lines.length);
    for (int i = 0; i < limit; i++) {
        String line = lines[i].trim();
        String lineLower = line.toLowerCase(java.util.Locale.ROOT);
        if (!lineLower.contains(needle)) {
            continue;
        }

        boolean hasNextLine = i + 1 < lines.length;
        if (takeTextAfter) {
            int index = lineLower.indexOf(needle) + needle.length();
            String afterKeyword = line.substring(index).trim();
            // Nothing meaningful after the keyword on this line: try the following line.
            if ((afterKeyword.isEmpty() || afterKeyword.equals(".")) && hasNextLine) {
                afterKeyword = lines[i + 1].trim();
            }
            if (afterKeyword.isEmpty() || afterKeyword.equals(".")) {
                return "Untitled";
            }
            return afterKeyword;
        }

        // Whole-line mode: append the next line unless it is another keyword line.
        if (hasNextLine
                && !lines[i + 1].toLowerCase(java.util.Locale.ROOT).contains(needle)) {
            return (line + " " + lines[i + 1].trim()).trim();
        }
        return line;
    }
    return "Untitled";
}

private String extractHeaderUsingFontSize(
PDDocument document, Boolean useFirstTextAsFallback, int linesToCheck)
throws IOException {
PDFTextStripper reader =
new PDFTextStripper() {
class LineInfo {
Expand All @@ -66,13 +129,13 @@ class LineInfo {

@Override
protected void processTextPosition(TextPosition text) {
if (lastY != text.getY() && lineCount < LINE_LIMIT) {
if (lastY != text.getY() && lineCount < linesToCheck) {
processLine();
lineBuilder = new StringBuilder(text.getUnicode());
maxFontSizeInLine = text.getFontSizeInPt();
lastY = text.getY();
lineCount++;
} else if (lineCount < LINE_LIMIT) {
} else if (lineCount < linesToCheck) {
lineBuilder.append(text.getUnicode());
if (text.getFontSizeInPt() > maxFontSizeInLine) {
maxFontSizeInLine = text.getFontSizeInPt();
Expand All @@ -81,7 +144,7 @@ protected void processTextPosition(TextPosition text) {
}

private void processLine() {
if (lineBuilder.length() > 0 && lineCount < LINE_LIMIT) {
if (lineBuilder.length() > 0 && lineCount < linesToCheck) {
lineInfos.add(new LineInfo(lineBuilder.toString(), maxFontSizeInLine));
}
}
Expand Down Expand Up @@ -125,17 +188,19 @@ public String getText(PDDocument doc) throws IOException {
: null);
}
};
reader.setEndPage(1);

String header = reader.getText(document);

// Sanitize the header string by removing characters not allowed in a filename.
if (header != null && header.length() < 255) {
header = header.replaceAll("[/\\\\?%*:|\"<>]", "").trim();
return WebResponseUtils.pdfDocToWebResponse(document, header + ".pdf");
return header.trim();
} else {
logger.info("File has no good title to be found");
return WebResponseUtils.pdfDocToWebResponse(
document, Filenames.toSimpleFileName(file.getOriginalFilename()));
return "Untitled";
}
}

/**
 * Removes characters that are not allowed in file names and trims surrounding whitespace.
 *
 * @param fileName the candidate name, without extension; may be null
 * @return the cleaned name, or {@code "Untitled"} when the input is null or nothing is left
 *     after cleaning
 */
private String sanitizeFileName(String fileName) {
    if (fileName == null) {
        return "Untitled";
    }
    String cleaned = fileName.replaceAll("[/\\\\?%*:|\"<>]", "").trim();
    // A name made up solely of forbidden characters would otherwise become just ".pdf".
    return cleaned.isEmpty() ? "Untitled" : cleaned;
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package stirling.software.SPDF.model.api.misc;

import org.springframework.web.multipart.MultipartFile;

import io.swagger.v3.oas.annotations.media.Schema;

import lombok.Data;
Expand All @@ -16,4 +18,9 @@ public class ExtractHeaderRequest extends PDFFile {
required = false,
defaultValue = "false")
private boolean useFirstTextAsFallback;

private MultipartFile fileInput;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This MultipartFile input is already part of this class; it is inherited from PDFFile, which this class extends.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fileInput already included via extended PDFFile

private String keyword;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any API documentation for these?

private Boolean useAfter;
private Integer linesToCheck;
}
21 changes: 21 additions & 0 deletions src/main/resources/templates/misc/auto-rename.html
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,29 @@
<form method="post" enctype="multipart/form-data" th:action="@{'/api/v1/misc/auto-rename'}">
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
<br>
<div class="form-group">
<label for="keyword">Keyword:</label>
<input type="text" class="form-control" id="keyword" name="keyword" placeholder="e.g., Company, Name, Invoice" required>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if people want to grab the title automatically without using any keywords, like the previous functionality? We still need to support that and make it clear to the user that this is an option.
Perhaps add a parameter to switch between
autodetecting from the title and using a keyword,
and show the options based on that.

</div>
<br>
<div class="form-group">
<label for="useAfter">Text to use:</label>
<select class="form-control" id="useAfter" name="useAfter">
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be good to support text before the keyword, and/or regex, as well.

<option value="false">Entire line containing keyword</option>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All text should use the translation feature,

i.e.
th:text="#{auto-rename.useAfter.false}"

<option value="true">Text after keyword</option>
</select>
</div>
<br>

<div class="form-group">
<label for="linesToCheck">Number of Lines to Check:</label>
<input type="number" class="form-control" id="linesToCheck" name="linesToCheck" value="10" min="1">
</div>
<br>

<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{auto-rename.submit}"></button>
</form>

</div>
</div>
</div>
Expand Down
Loading