/// <summary>
/// Extracts the title of a PDF document by combining plain-text extraction with
/// the per-character layout records collected in <see cref="WordsInfo"/>.
/// </summary>
public class IETitle
{
    /// <summary>
    /// Per-character records for the document currently being processed.
    /// Cleared at the start of every <see cref="GetTitle"/> call and consumed by
    /// <c>Word.MakeWord</c>. Shared static state: concurrent calls to
    /// <see cref="GetTitle"/> would interleave their records.
    /// </summary>
    public static List<WordInfo> WordsInfo = new List<WordInfo>();

    // Last raw text extracted by GetTitle; assigned here, not read within this class.
    private static string pdfcontent;

    // NOTE(review): the second argument of ExtractTextFromPdf was missing from the
    // listing this code was recovered from. 1 is a reconstruction (presumably the
    // first page, the page the title is expected on) - confirm against ITextSharpLib.
    private const int TextPageArgument = 1;

    /// <summary>
    /// Runs the title-extraction pipeline for one PDF file.
    /// </summary>
    /// <param name="path">Path of the PDF file to analyse.</param>
    /// <param name="realtitle">Passed through unchanged to <c>InfoExtract.ExtractTitle</c>.</param>
    /// <returns>
    /// The result produced by <c>InfoExtract.ExtractTitle</c>, or a result whose
    /// <c>Title</c> is the empty string when that step throws.
    /// </returns>
    public static HandleResult GetTitle(string path, string realtitle)
    {
        WordsInfo.Clear();

        string content = ExtractRawContent(path);
        pdfcontent = content;

        // Walk the PDF content stream; the return value is not used here.
        PDFBoxLib.HandleContent(path);

        // Process characters
        Word w = new Word();
        w.MakeWord(WordsInfo);

        Line line = new Line();
        line.MakeLine(w);

        // Process lines
        Block block = new Block();
        block.MakeBlock(line);

        // Get the full text; fall back to the raw content extracted above on failure.
        string text;
        try
        {
            text = ITextSharpLib.ExtractTextFromPdf(path, TextPageArgument);
        }
        catch (Exception ex)
        {
            Logger.Debug(ex.Message);
            text = content;
        }

        HandleResult title = new HandleResult() { Title = "" };
        try
        {
            var sentences = text.Split('\n');
            InfoExtract ie = new InfoExtract(sentences, text);
            title = ie.ExtractTitle(block, realtitle);
        }
        catch (Exception ex)
        {
            // Best effort: a failed extraction yields the empty-title result.
            Logger.Debug(ex.Message);
        }

        return title;
    }

    /// <summary>
    /// Extracts raw text with iTextSharp, falling back to PDFBox when that throws.
    /// Returns the empty string when both extractors fail.
    /// </summary>
    private static string ExtractRawContent(string path)
    {
        try
        {
            return ITextSharpLib.ExtractTextFromPdf(path);
        }
        catch (Exception ex)
        {
            Logger.Debug(ex.Message);
        }

        try
        {
            return PDFBoxLib.Pdf2txt(path);
        }
        catch (Exception ex)
        {
            Logger.Debug(ex.Message);
            return string.Empty;
        }
    }
}

上面就是获取标题的整体逻辑代码。其中 PDFBoxLib.HandleContent(path) 这一行调用 PDFBoxLib,读取 PDF 第一页的内容:

  /// <summary>
  /// Loads a PDF with PDFBox and runs a <c>PrintTextLocatins2</c> over the content
  /// stream of the selected page(s).
  /// </summary>
  /// <param name="fileName">Path of the PDF file to load.</param>
  /// <param name="pageIndex">
  /// NOTE(review): the numeric literals of this method were missing from the listing
  /// it was recovered from. Reconstructed as: 0 processes every page, any other value
  /// (including the default 1) processes only the first page - confirm against the
  /// original source.
  /// </param>
  /// <returns>Always the empty string; failures are logged, not thrown.</returns>
  public static string HandleContent(string fileName, int pageIndex = 1)
  {
      PDDocument document = null;
      try
      {
          document = PDDocument.load(fileName);
          List allPages = document.getDocumentCatalog().getAllPages();

          // Clamp to the real page count so a document without pages is skipped
          // instead of failing on allPages.get(0).
          int size = pageIndex == 0 ? allPages.size() : System.Math.Min(1, allPages.size());

          for (int i = 0; i < size; i++)
          {
              var page = (PDPage)allPages.get(i);
              var contents = page.getContents();
              if (contents == null)
              {
                  // Page without a content stream: nothing to process.
                  continue;
              }

              PrintTextLocatins2 printer = new PrintTextLocatins2();
              printer.processStream(page, page.findResources(), contents.getStream());
          }
      }
      catch (Exception ex)
      {
          // Best effort: callers get the empty string either way; keep a trace.
          Logger.Debug(ex.Message);
      }
      finally
      {
          if (document != null)
          {
              try
              {
                  document.close();
              }
              catch (Exception ex)
              {
                  Logger.Debug(ex.Message);
              }
          }
      }

      return "";
  }

其中的 printer.processStream 方法,会触发自定义类 PrintTextLocatins2 中的字符处理方法 processTextPosition:

  /// <summary>
  /// A <c>PDFTextStripper</c> that records every text position it is handed as a
  /// <c>WordInfo</c> appended to <c>IETitle.WordsInfo</c>.
  /// </summary>
  public class PrintTextLocatins2 : PDFTextStripper
  {
      // Substrings of a base font name that are treated as marking a bold face.
      private static readonly string[] BoldFlags = { "Bold", "CAJ FNT04" };

      // Substrings of a base font name that are treated as marking an italic face.
      private static readonly string[] ItalicFlags = { "Italic", "CAJ FNT03" };

      /// <summary>Returns true when <paramref name="font"/> contains any of <paramref name="flags"/>.</summary>
      private static bool ContainsAny(string font, string[] flags)
      {
          if (font == null)
          {
              // No base font name available: treat as "no flag present".
              return false;
          }

          foreach (string flag in flags)
          {
              if (font.Contains(flag))
              {
                  return true;
              }
          }

          return false;
      }

      /// <summary>Returns true when the base font name contains a bold marker.</summary>
      private static bool IsBold(string font)
      {
          return ContainsAny(font, BoldFlags);
      }

      /// <summary>Returns true when the base font name contains an italic marker.</summary>
      private static bool IsItalic(string font)
      {
          return ContainsAny(font, ItalicFlags);
      }

      /// <summary>Creates the stripper with position sorting switched off.</summary>
      public PrintTextLocatins2()
      {
          base.setSortByPosition(false);
      }

      /// <summary>
      /// Copies the geometry and font attributes of one text position into a
      /// <c>WordInfo</c> and appends it to <c>IETitle.WordsInfo</c>.
      /// </summary>
      /// <param name="text">The text position supplied by the base stripper.</param>
      protected override void processTextPosition(TextPosition text)
      {
          var font = text.getFont();
          var baseFont = font.getBaseFont();

          WordInfo info = new WordInfo()
          {
              X = text.getX(),
              Y = text.getY(),
              XDirAdj = text.getXDirAdj(),
              YDirAdj = text.getYDirAdj(),
              FontSize = text.getFontSize(),
              Xscale = text.getXScale(),
              Yscale = text.getYScale(),
              Height = text.getHeight(),
              Space = text.getWidthOfSpace(),
              Width = text.getWidth(),
              Subfont = font.getSubType(),
              Basefont = baseFont,
              IsBold = IsBold(baseFont),
              IsItalic = IsItalic(baseFont),
              XSize = (int)(text.getFontSize() * text.getXScale()),
              YSize = (int)(text.getFontSize() * text.getYScale()),
              Word = text.getCharacter()
          };

          // The original compared Space.ToString() with the zh-CN NaN text, which only
          // matches under a Chinese culture; double.IsNaN is culture-independent.
          // NOTE(review): the replacement value was missing from the recovered listing;
          // 0 is a reconstruction.
          if (double.IsNaN(info.Space))
          {
              info.Space = 0;
          }

          IETitle.WordsInfo.Add(info);
      }
  }

这样我们就利用pdfbox收集了pdf文档的字符信息。

最新文章

  1. 【WCF】错误协定声明
  2. 超级小的web手势库AlloyFinger发布
  3. QHash
  4. 关于《rsyslog+mysql+loganalyzer搭建日志服务器<个人笔记>》的反思
  5. PHP之MVC项目实战(三)
  6. centos55_oracle11gr2_install
  7. GAE初探-一鼻子灰
  8. 使用passenger在Centos7部署Puma+Nginx+Ruby on Rails
  9. C# 枚举
  10. js实现睡眠
  11. Flash CS 自定义组件
  12. HTTP/1.1与HTTP/1.0的区别[转]
  13. python读取/创建XML文件
  14. ionic 实现微信朋友圈分享的完整开发流程
  15. eclipse编辑器栏上的路径怎么去掉
  16. js 实时数据显示
  17. 浏览器根对象window之performance
  18. html超链接返回上一页面
  19. 【Leetcode】【Medium】Find Peak Element
  20. EASYUI DATAGRID加合计

热门文章

  1. [SCOI2009][bzoj1025]游戏
  2. CENTOS6.6 下mysql MHA架构搭建
  3. js分页功能实现
  4. 机器学习03:K近邻算法
  5. Yii的数组助手类
  6. 机器学习策略——DeepLearning.AI课程总结
  7. nyoj940 A dp problem 打表
  8. 嵌入式Linux引导过程之1.5——从BootRom到Xloader
  9. Linux常用软件
  10. setsockopt()用法(参数详细说明) recv、send的超时处理