package test;

 import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern; import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.util.PDFTextStripper; import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;

/**
 * Helpers for pulling plain text out of uploaded PDF files.
 */
public class UploadUtils {

    /** Sample document used by {@link #main(String[])} when no path is given. */
    private static final String DEFAULT_SAMPLE_PATH =
            "E:\\MyWork\\guyezhai\\pdf提取\\路径依赖视角下高校新专业建设的策略创新(1).pdf";

    /**
     * Extracts the text of a PDF file and returns it as one string with all
     * line breaks removed.
     *
     * <p>The file is first parsed with PDFBox. If the parsed document reports
     * itself as encrypted, the text is instead read page by page through iText;
     * otherwise PDFBox's text stripper is used.
     *
     * <p>This method is best effort: any exception raised while opening,
     * parsing or extracting is printed to standard error and an empty string
     * is returned.
     *
     * @param filePath path of the PDF file to read
     * @return the extracted text with line breaks removed, or an empty string
     *         if the file could not be read or parsed
     */
    public String readPdf(String filePath) {
        String text = "";
        FileInputStream fis = null;
        PdfReader pdfReader = null;
        COSDocument cosDocument = null;
        try {
            fis = new FileInputStream(filePath);
            PDFParser parser = new PDFParser(fis);
            parser.parse();
            cosDocument = parser.getDocument();

            String raw;
            if (cosDocument.isEncrypted()) {
                // Encrypted documents are read through iText, one page at a time
                // (iText page numbers start at 1).
                pdfReader = new PdfReader(filePath);
                StringBuilder pages = new StringBuilder();
                int pageCount = pdfReader.getNumberOfPages();
                for (int page = 1; page <= pageCount; page++) {
                    pages.append(PdfTextExtractor.getTextFromPage(pdfReader, page));
                }
                raw = pages.toString();
            } else {
                raw = new PDFTextStripper().getText(parser.getPDDocument());
            }
            text = joinLines(raw);
        } catch (Exception e) {
            // Deliberately best effort: report the failure and fall through to
            // return the empty default instead of propagating.
            e.printStackTrace();
        } finally {
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (pdfReader != null) {
                pdfReader.close();
            }
            if (cosDocument != null) {
                try {
                    cosDocument.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return text;
    }

    /**
     * Concatenates the lines of {@code raw} without any separator.
     *
     * <p>A carriage return immediately before a line feed is dropped together
     * with it, so text that uses CRLF line ends does not leave stray
     * {@code '\r'} characters in the result.
     */
    private static String joinLines(String raw) {
        return raw.replace("\r\n", "").replace("\n", "");
    }

    /**
     * Tells whether a character sequence is {@code null}, empty, or made up
     * solely of characters for which {@link Character#isWhitespace(char)} is
     * true.
     *
     * @param cs the sequence to check, may be {@code null}
     * @return {@code true} if {@code cs} is {@code null}, has length zero, or
     *         contains only whitespace; {@code false} otherwise
     */
    public static boolean isBlank(CharSequence cs) {
        if (cs == null) {
            return true;
        }
        int length = cs.length();
        for (int i = 0; i < length; i++) {
            if (!Character.isWhitespace(cs.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Manual smoke test: extracts one PDF and prints the text to standard error.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to read.
     *             When absent, a built-in sample path is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_SAMPLE_PATH;
        System.err.println(new UploadUtils().readPdf(path));
    }
}

其中用到的jar包:

bcpkix-jdk15on-1.47.jar
bcprov-jdk15on-1.49.jar
commons-logging-1.1.x.jar
fontbox-1.8.x.jar
icu4j-4.0.x.jar
itextpdf-5.4.x.jar
jempbox-1.8.x.jar
pdfbox-1.8.x.jar

最新文章

  1. Oracle Blob数据保存为文件
  2. flexigrid随手记
  3. Spring+hibernate+struts
  4. 将main方法打成jar包,并引用第三方的maven jar包
  5. JS中exec函数与match函数的区别与联系
  6. android适应屏幕
  7. checkbox组件
  8. WPF - Build Error总结
  9. ionic3.0--angular4.0 引入第三方插件库的方法
  10. 帧同步(LockStep)该如何反外挂
  11. 与其他相似软件对比,win10中个人助理Cortana具备哪些独特的功能
  12. ceil以及double的精度问题
  13. linux动态库与静态库混合连接
  14. html5手机web app <input type="file"> 只调用图库,禁止调用摄像头?
  15. 【教程】minicom使用教程
  16. [SQL Server] 无法连接到本地数据库
  17. 关于mouseleave事件触发的bug问题
  18. 第三方开源--Android Image Cropper--图片裁剪
  19. unix下面是常用命令及简单说明
  20. vue2.0:(七)、vue-resource

热门文章

  1. 2018软工实践—Alpha冲刺(8)
  2. 让程序运行更加面向用户——电梯V2.1
  3. lintcode-129-重哈希
  4. 1106C程序语法树
  5. 遍历frame中的表单:
  6. PHP中与类有关的运算符
  7. 当对象使用sort时候 前提是实现compareTo的方法
  8. iOS--开发从入门到精通
  9. C++解析-外传篇(3):动态内存申请的结果
  10. HTTP摘要认证原理以及HttpClient4.3实现