Stirling-Tools · tanseer123 · Jul 29, 2024 · Frooodle · Jul 31, 2024 · Frooodle
@@ -18,7 +18,6 @@
 import org.springframework.web.bind.annotation.RestController;
 import org.springframework.web.multipart.MultipartFile;
 
-import io.github.pixee.security.Filenames;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.tags.Tag;
 
@@ -33,19 +32,83 @@ public class AutoRenameController {
     private static final Logger logger = LoggerFactory.getLogger(AutoRenameController.class);
 
     private static final float TITLE_FONT_SIZE_THRESHOLD = 20.0f;
-    private static final int LINE_LIMIT = 11;
+    private static final int DEFAULT_LINE_LIMIT = 11;
 
     @PostMapping(consumes = "multipart/form-data", value = "/auto-rename")
     @Operation(
-            summary = "Extract header from PDF file",
+            summary = "Extract header from PDF file or Auto rename ",
             description =
-                    "This endpoint accepts a PDF file and attempts to extract its title or header based on heuristics. Input:PDF Output:PDF Type:SISO")
+                    "This endpoint accepts a PDF file and attempts to rename it based on various methods. Based on keyword or else extract its title or header based on heuristics. Input:PDF Output:PDF Type:SISO")
     public ResponseEntity<byte[]> extractHeader(@ModelAttribute ExtractHeaderRequest request)
-            throws Exception {
+            throws IOException {
         MultipartFile file = request.getFileInput();
         Boolean useFirstTextAsFallback = request.isUseFirstTextAsFallback();
 
+        String keyword = request.getKeyword();
+        Boolean useAfter = request.getUseAfter();
+        Integer linesToCheck =
+                request.getLinesToCheck() != null ? request.getLinesToCheck() : DEFAULT_LINE_LIMIT;
+
         PDDocument document = Loader.loadPDF(file.getBytes());
+        boolean check = keyword != null && !keyword.isEmpty();
+
+        String newFileName;
+        if (keyword != null && !keyword.isEmpty()) {
+            newFileName = getTextByKeyword(document, keyword, useAfter, linesToCheck);
+            if ("Untitled".equals(newFileName)) {
+                newFileName =
+                        extractHeaderUsingFontSize(document, useFirstTextAsFallback, linesToCheck);
+            }
+        } else {
+            newFileName =
+                    extractHeaderUsingFontSize(document, useFirstTextAsFallback, linesToCheck);
+        }
+        newFileName = sanitizeFileName(newFileName) + ".pdf";
+        return WebResponseUtils.pdfDocToWebResponse(document, newFileName);
+    }
+
+    /**
+     * Derives a file name from the first line on page 1 that contains {@code keyword}.
+     *
+     * <p>Matching is case-insensitive and only the first {@code linesToCheck} extracted lines are
+     * inspected for a match.
+     *
+     * @param document PDF whose first page is searched
+     * @param keyword text to look for; {@code null} or blank yields {@code "Untitled"}
+     * @param useAfter {@code true} to return only the text following the keyword (or the next
+     *     line when nothing but {@code "."} follows it); {@code false} or {@code null} to return
+     *     the matching line, joined with the next line unless that line also contains the keyword
+     * @param linesToCheck maximum number of lines examined for a match
+     * @return the derived text, or {@code "Untitled"} when no usable text is found
+     * @throws IOException propagated from the text extraction
+     */
+    private String getTextByKeyword(
+            PDDocument document, String keyword, Boolean useAfter, int linesToCheck)
+            throws IOException {
+        String needle = keyword == null ? "" : keyword.toLowerCase().trim();
+        if (needle.isEmpty()) {
+            // contains("") is true for every line, so a blank keyword can never be a real match.
+            return "Untitled";
+        }
+        // useAfter is a nullable wrapper; unboxing it directly throws when the field is omitted.
+        boolean takeTextAfter = Boolean.TRUE.equals(useAfter);
+
+        PDFTextStripper stripper = new PDFTextStripper();
+        stripper.setStartPage(1);
+        stripper.setEndPage(1);
+        String[] lines = stripper.getText(document).split("\n");
+
+        int limit = Math.min(linesToCheck, lines.length);
+        for (int i = 0; i < limit; i++) {
+            String line = lines[i].trim();
+            int matchStart = line.toLowerCase().indexOf(needle);
+            if (matchStart < 0) {
+                continue;
+            }
+            boolean hasNext = i + 1 < lines.length;
+
+            if (!takeTextAfter) {
+                // Whole-line mode: append the following line unless it is another keyword line.
+                if (hasNext && !lines[i + 1].toLowerCase().contains(needle)) {
+                    return (line + " " + lines[i + 1].trim()).trim();
+                }
+                return line;
+            }
+
+            // Lower-casing can change a string's length, so clamp before slicing the original.
+            int cut = Math.min(matchStart + needle.length(), line.length());
+            String afterKeyword = line.substring(cut).trim();
+            if ((afterKeyword.isEmpty() || ".".equals(afterKeyword)) && hasNext) {
+                // Nothing meaningful on the keyword line itself: try the line below it.
+                afterKeyword = lines[i + 1].trim();
+            }
+            if (afterKeyword.isEmpty() || ".".equals(afterKeyword)) {
+                return "Untitled";
+            }
+            return afterKeyword;
+        }
+        return "Untitled";
+    }
+
+    private String extractHeaderUsingFontSize(
+            PDDocument document, Boolean useFirstTextAsFallback, int linesToCheck)
+            throws IOException {
         PDFTextStripper reader =
                 new PDFTextStripper() {
                     class LineInfo {
@@ -66,13 +129,13 @@ class LineInfo {
 
                     @Override
                     protected void processTextPosition(TextPosition text) {
-                        if (lastY != text.getY() && lineCount < LINE_LIMIT) {
+                        if (lastY != text.getY() && lineCount < linesToCheck) {
                             processLine();
                             lineBuilder = new StringBuilder(text.getUnicode());
                             maxFontSizeInLine = text.getFontSizeInPt();
                             lastY = text.getY();
                             lineCount++;
-                        } else if (lineCount < LINE_LIMIT) {
+                        } else if (lineCount < linesToCheck) {
                             lineBuilder.append(text.getUnicode());
                             if (text.getFontSizeInPt() > maxFontSizeInLine) {
                                 maxFontSizeInLine = text.getFontSizeInPt();
@@ -81,7 +144,7 @@ protected void processTextPosition(TextPosition text) {
                     }
 
                     private void processLine() {
-                        if (lineBuilder.length() > 0 && lineCount < LINE_LIMIT) {
+                        if (lineBuilder.length() > 0 && lineCount < linesToCheck) {
                             lineInfos.add(new LineInfo(lineBuilder.toString(), maxFontSizeInLine));
                         }
                     }
@@ -125,17 +188,19 @@ public String getText(PDDocument doc) throws IOException {
                                         : null);
                     }
                 };
+        reader.setEndPage(1);
 
         String header = reader.getText(document);
 
-        // Sanitize the header string by removing characters not allowed in a filename.
         if (header != null && header.length() < 255) {
-            header = header.replaceAll("[/\\\\?%*:|\"<>]", "").trim();
-            return WebResponseUtils.pdfDocToWebResponse(document, header + ".pdf");
+            return header.trim();
         } else {
             logger.info("File has no good title to be found");
-            return WebResponseUtils.pdfDocToWebResponse(
-                    document, Filenames.toSimpleFileName(file.getOriginalFilename()));
+            return "Untitled";
         }
     }
+
+    /**
+     * Removes characters that are not allowed in a file name and trims the result.
+     *
+     * @param fileName candidate name without extension; may be {@code null}
+     * @return the cleaned name, or {@code "Untitled"} when nothing is left after cleaning
+     */
+    private String sanitizeFileName(String fileName) {
+        String cleaned =
+                fileName == null ? "" : fileName.replaceAll("[/\\\\?%*:|\"<>]", "").trim();
+        // A name made only of stripped characters would otherwise yield a bare ".pdf".
+        return cleaned.isEmpty() ? "Untitled" : cleaned;
+    }
 }
@@ -1,5 +1,7 @@
 package stirling.software.SPDF.model.api.misc;
 
+import org.springframework.web.multipart.MultipartFile;
+
 import io.swagger.v3.oas.annotations.media.Schema;
 
 import lombok.Data;
@@ -16,4 +18,9 @@ public class ExtractHeaderRequest extends PDFFile {
             required = false,
             defaultValue = "false")
     private boolean useFirstTextAsFallback;
+
+    private MultipartFile fileInput;
+    private String keyword;
+    private Boolean useAfter;
+    private Integer linesToCheck;
 }
@@ -20,8 +20,29 @@
               <form method="post" enctype="multipart/form-data" th:action="@{'/api/v1/misc/auto-rename'}">
                 <div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
                 <br>
+                <!-- The keyword is optional: when it is left empty the endpoint derives the name from the title/header heuristic instead. -->
+                <div class="form-group">
+                  <label for="keyword">Keyword:</label>
+                  <input type="text" class="form-control" id="keyword" name="keyword" placeholder="e.g., Company, Name, Invoice">
+                </div>
+                <br>
+                <div class="form-group">
+                  <label for="useAfter">Text to use:</label>
+                  <select class="form-control" id="useAfter" name="useAfter">
+                    <option value="false">Entire line containing keyword</option>
+                    <option value="true">Text after keyword</option>
+                  </select>
+                </div>
+                <br>
+
+                <div class="form-group">
+                  <label for="linesToCheck">Number of Lines to Check:</label>
+                  <input type="number" class="form-control" id="linesToCheck" name="linesToCheck" value="10" min="1">
+                </div>
+                <br>
+
                 <button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{auto-rename.submit}"></button>
               </form>
+
             </div>
           </div>
         </div>