c#抽取pdf文档标题（2）

  /// <summary>
  /// Entry point for extracting the title of a PDF document.
  /// </summary>
  public class IETitle
     {
         /// <summary>
         /// Per-character records for the document currently being processed.
         /// Cleared at the start of every <see cref="GetTitle"/> call and appended to by
         /// PrintTextLocatins2.processTextPosition. This is shared static state with no
         /// synchronization, so concurrent GetTitle calls would interleave their data.
         /// </summary>
         public static List<WordInfo> WordsInfo = new List<WordInfo>();

         // Last extracted plain text; only written in this class.
         private static string pdfcontent;

         /// <summary>
         /// Extracts the title of the PDF at <paramref name="path"/>.
         /// </summary>
         /// <param name="path">Path of the PDF file.</param>
         /// <param name="realtitle">Passed through unchanged to InfoExtract.ExtractTitle.</param>
         /// <returns>
         /// The result produced by InfoExtract.ExtractTitle, or a result whose Title is ""
         /// when title extraction throws.
         /// </returns>
         public static HandleResult GetTitle(string path, string realtitle)
         {
             WordsInfo.Clear();

             string content = ExtractContent(path);
             pdfcontent = content;

             // The return value is ignored: the call is made so that the PDFBox text
             // stripper fills WordsInfo with per-character records.
             PDFBoxLib.HandleContent(path);

             // Process characters.
             Word w = new Word();
             w.MakeWord(WordsInfo);

             Line line = new Line();
             line.MakeLine(w);

             // Process lines.
             Block block = new Block();
             block.MakeBlock(line);

             // Get the text used for sentence splitting.
             string text;
             try
             {
                 // NOTE(review): the second argument was lost from the published listing;
                 // 1 is assumed here -- confirm against ITextSharpLib.ExtractTextFromPdf.
                 text = ITextSharpLib.ExtractTextFromPdf(path, 1);
             }
             catch (Exception ex)
             {
                 Logger.Debug(ex.Message);
                 // Fall back to the text extracted at the start of the method.
                 text = content;
             }

             HandleResult title = new HandleResult() { Title = "" };
             try
             {
                 var sentences = text.Split('\n');
                 InfoExtract ie = new InfoExtract(sentences, text);
                 title = ie.ExtractTitle(block, realtitle);
             }
             catch (Exception ex)
             {
                 // Best effort: a failure here yields the empty-title result.
                 Logger.Debug(ex.Message);
             }

             return title;
         }

         /// <summary>
         /// Extracts plain text with iTextSharp, falling back to PDFBox when that throws.
         /// Returns an empty string when both extractors throw.
         /// </summary>
         private static string ExtractContent(string path)
         {
             try
             {
                 return ITextSharpLib.ExtractTextFromPdf(path);
             }
             catch (Exception ex)
             {
                 Logger.Debug(ex.Message);
             }

             try
             {
                 return PDFBoxLib.Pdf2txt(path);
             }
             catch (Exception ex)
             {
                 // Previously swallowed silently; record it so failures can be diagnosed.
                 Logger.Debug(ex.Message);
                 return string.Empty;
             }
         }
     }

上面就是获取标题的整体逻辑代码。其中 PDFBoxLib.HandleContent(path) 这一行调用 PDFBoxLib，读取 PDF 第一页的内容：

  /// <summary>
  /// Runs the PDFBox text stripper over the pages of a PDF so that
  /// PrintTextLocatins2 can record per-character information.
  /// </summary>
  /// <param name="fileName">Path of the PDF file to load.</param>
  /// <param name="pageIndex">
  /// 0 processes every page; any other value processes only the first page.
  /// NOTE(review): the numeric literals (default value, comparison operand and page
  /// count) were lost from the published listing. They are reconstructed from the
  /// article's statement that the default call reads the first page -- confirm.
  /// </param>
  /// <returns>Always an empty string; the results are the stripper's side effects.</returns>
  public static string HandleContent(string fileName, int pageIndex = 1)
         {
             PDDocument document = null;
             try
             {
                 document = PDDocument.load(fileName);
                 List allPages = document.getDocumentCatalog().getAllPages();

                 // Clamp to the real page count so an empty document does not index out of range.
                 int size = pageIndex == 0 ? allPages.size() : System.Math.Min(1, allPages.size());
                 for (int i = 0; i < size; i++)
                 {
                     var page = (PDPage)allPages.get(i);
                     var contents = page.getContents();
                     if (contents == null)
                     {
                         continue;
                     }

                     // processStream invokes the stripper's processTextPosition override
                     // for the text positions it encounters.
                     PrintTextLocatins2 printer = new PrintTextLocatins2();
                     printer.processStream(page, page.findResources(), contents.getStream());
                 }
             }
             catch (Exception ex)
             {
                 // Best effort: callers ignore failures, but leave a trace for diagnosis.
                 System.Diagnostics.Trace.TraceWarning("HandleContent failed for '{0}': {1}", fileName, ex.Message);
             }
             finally
             {
                 if (document != null)
                 {
                     try
                     {
                         document.close();
                     }
                     catch (Exception ex)
                     {
                         System.Diagnostics.Trace.TraceWarning("Closing '{0}' failed: {1}", fileName, ex.Message);
                     }
                 }
             }

             return "";
         }

上面代码中的 printer.processStream 方法，会触发自定义类 PrintTextLocatins2 中的字符处理方法 processTextPosition：

  /// <summary>
  /// PDFBox text stripper that records every text position it processes as a
  /// WordInfo entry in IETitle.WordsInfo.
  /// </summary>
  public class PrintTextLocatins2 : PDFTextStripper
     {
         // Substrings of a base font name that mark the font as bold.
         private static readonly String[] BOLD_FLAGS = { "Bold", "CAJ FNT04" };

         // Substrings of a base font name that mark the font as italic.
         private static readonly String[] ITALIC_FLAGS = { "Italic", "CAJ FNT03" };

         /// <summary>Returns true when the font name contains any bold marker.</summary>
         private static bool IsBold(String font)
         {
             return ContainsAny(font, BOLD_FLAGS);
         }

         /// <summary>Returns true when the font name contains any italic marker.</summary>
         private static bool IsItalic(String font)
         {
             return ContainsAny(font, ITALIC_FLAGS);
         }

         /// <summary>
         /// Returns true when <paramref name="font"/> contains at least one of
         /// <paramref name="flags"/>. A null font name matches nothing.
         /// </summary>
         private static bool ContainsAny(String font, String[] flags)
         {
             if (font == null)
             {
                 return false;
             }

             foreach (String flag in flags)
             {
                 if (font.Contains(flag))
                 {
                     return true;
                 }
             }

             return false;
         }

         public PrintTextLocatins2()
         {
             base.setSortByPosition(false);
         }

         /// <summary>
         /// Converts one text position into a WordInfo record and appends it to
         /// IETitle.WordsInfo.
         /// </summary>
         /// <param name="text">The text position supplied by the stripper.</param>
         protected override void processTextPosition(TextPosition text)
         {
             var font = text.getFont();
             String baseFont = font.getBaseFont();
             float fontSize = text.getFontSize();
             float xScale = text.getXScale();
             float yScale = text.getYScale();

             WordInfo info = new WordInfo()
             {
                 X = text.getX(),
                 Y = text.getY(),
                 XDirAdj = text.getXDirAdj(),
                 YDirAdj = text.getYDirAdj(),
                 FontSize = fontSize,
                 Xscale = xScale,
                 Yscale = yScale,
                 Height = text.getHeight(),
                 Space = text.getWidthOfSpace(),
                 Width = text.getWidth(),
                 Subfont = font.getSubType(),
                 Basefont = baseFont,
                 IsBold = IsBold(baseFont),
                 IsItalic = IsItalic(baseFont),
                 XSize = (int)(fontSize * xScale),
                 YSize = (int)(fontSize * yScale),
                 Word = text.getCharacter()
             };

             // The original compared Space.ToString() with "非数字", the NaN text
             // under a Chinese culture, so it only worked on zh-CN systems.
             // NOTE(review): the replacement value was lost from the published
             // listing; 0 is assumed -- confirm.
             if (double.IsNaN(info.Space))
             {
                 info.Space = 0;
             }

             IETitle.WordsInfo.Add(info);
         }
     }

这样我们就利用pdfbox收集了pdf文档的字符信息。

巴特西

c#抽取pdf文档标题（2）

最新文章

热门文章