使用itextpdf提取pdf内容
2024-08-30 21:03:26
package test; import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern; import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.util.PDFTextStripper; import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor; public class UploadUtils { private final static Pattern pattern = Pattern.compile("\\d+");
private final static int stateParaOverFlag = 800;
private final static int thankParaOverFlag = 800; /**
* 读取pdf参考文献内容
*
* @param s
* @return
*/
public String readPdf(String filePath) {
StringBuilder buffer = new StringBuilder();
FileInputStream fis = null;
PdfReader pdfReader = null;
COSDocument cosDocument = null;
String[] paragraphs = null;
PDFParser p;
boolean addBool = true;
boolean judgeState = false;
boolean judgeThank = false;
StringBuilder tempSb = new StringBuilder();
try {
fis = new FileInputStream(filePath);
p = new PDFParser(fis);
p.parse();
cosDocument = p.getDocument();
// 加密文档判断
if (cosDocument.isEncrypted()) {
StringBuilder tempContent = new StringBuilder();
pdfReader = new PdfReader(filePath);
int i = pdfReader.getNumberOfPages();
for (int j = 1; j <= i; j++) {
tempContent.append(PdfTextExtractor.getTextFromPage(pdfReader, j));
}
paragraphs = tempContent.toString().split("\n");
} else {
PDFTextStripper ts = new PDFTextStripper();
paragraphs = ts.getText(p.getPDDocument()).split("\n");
}
boolean mark = false;
List<Integer> errornum = new ArrayList<Integer>();
int flag = 0;
int endRange = paragraphs.length * 70 / 100;
int rangeFlag = 0;
for (String lineContent : paragraphs) {
if (judgeState) {
tempSb.append(lineContent);
if (tempSb.length() >= stateParaOverFlag) {
judgeState = false;
addBool = true;
tempSb.delete(0, tempSb.length() - 1);
}
}
if (judgeThank) {
tempSb.append(lineContent);
if (tempSb.length() >= thankParaOverFlag) {
judgeThank = false;
addBool = true;
tempSb.delete(0, tempSb.length() - 1);
}
}
if (addBool) {
buffer.append(lineContent);
}
if (mark && rangeFlag >= endRange) {
if (lineContent.length() < 5) {
errornum.add(++flag);
rangeFlag++;
continue;
}
if (pattern.matcher(lineContent.substring(0, 5)).find()) {
if (flag != 0) {
flag = 0;
errornum.clear();
}
} else {
errornum.add(++flag);
}
if (errornum.size() > 2) {
mark = false;
}
}
rangeFlag++;
}
} catch (Exception e) {
e.printStackTrace();
} finally {
if (fis != null) {
try {
fis.close();
} catch (IOException e) {
e.printStackTrace();
} finally {
fis = null;
}
}
if (pdfReader != null) {
pdfReader.close();
}
if (cosDocument != null) {
try {
cosDocument.close();
} catch (IOException e) {
e.printStackTrace();
} finally {
cosDocument = null;
}
}
}
return buffer.toString();
} public static boolean isBlank(CharSequence cs) {
int strLen;
if (cs == null || (strLen = cs.length()) == 0) {
return true;
}
for (int i = 0; i < strLen; i++) {
if (Character.isWhitespace(cs.charAt(i)) == false) {
return false;
}
}
return true;
} public static void main(String[] args) {
// System.err.println(new UploadUtils()
// .readPdf("/opt/fileCache/2014/125/13/shuangping_D7037870CF4FC5C421A3E5359DCF8BBE.pdf"));
System.err.println(new UploadUtils().readPdf("E:\\MyWork\\guyezhai\\pdf提取\\路径依赖视角下高校新专业建设的策略创新(1).pdf")); } }
其中用到的jar包:
bcpkix-jdk15on-1.47.jar
bcprov-jdk15on-1.49.jar
commons-logging-1.1..jar
fontbox-1.8..jar
icu4j-4.0..jar
itextpdf-5.4..jar
jempbox-1.8..jar
pdfbox-1.8..jar
最新文章
- Oracle Blob数据保存为文件
- flexigrid随手记
- Spring+hibernate+struts
- 将main方法打成jar包,并引用第三方的maven jar包
- JS中exec函数与match函数的区别与联系
- android适应屏幕
- checkbox组件
- WPF - Build Error总结
- ionic3.0--angular4.0 引入第三方插件库的方法
- 帧同步(LockStep)该如何反外挂
- 与其他相似软件对比,win10中个人助理conrtana具备哪些独特的功能
- ceil以及double的精度问题
- linux动态库与静态库混合连接
- html5手机web app <;input type=";file"; >; 只调用图库,禁止调用摄像头?
- 【教程】minicom使用教程
- [SQL Server] 无法连接到本地数据库
- 关于mouseleave事件触发的bug问题
- 第三方开源--Android Image Cropper--图片裁剪
- unix下面是常用命令及简单说明
- vue2.0:(七)、vue-resource