Share » Learn » eZ Publish » Indexing Multiple Binary File Types

Indexing Multiple Binary File Types

Wednesday 20 September 2006 1:35:00 pm

  • Currently 5 out of 5 Stars.
  • 1
  • 2
  • 3
  • 4
  • 5

Rather than call each parsing utility individually, we specified in the configuration file that our custom plugin gets called for every file type. The plugin will then determine which file type is being indexed and call the appropriate parsing utility.

Read this code carefully before you implement it. Note that we are doing things like limiting the number of characters indexed from each file, and also stripping out irregular characters. (We did this to track down a problem we were having with very large files. We think the character limit fixed the issue, but we left the character stripping in there just in case. You may want to remove it and see what kind of results you get.)

  1. Create the file ezbinaryfileparser.php in the directory
    /kernel/classes/datatypes/ezbinaryfile/plugins/.
  2. Place the following code in the PHP file:
<?php
/*!
\class eZBinaryFileParser ezbinaryfileparser.php
\ingroup eZKernel
\brief The class eZBinaryFileParser handles parsing of Word, Excel, Powerpoint, and PDF files and returns the metadata
*/
class eZBinaryFileParser
{
     /*!
      Extracts indexable text from the file \a $sFileName by running the
      external converter selected by the last three characters of the name
      (pdf: pdftotext, doc: catdoc, xls: xls2csv, ppt: catppt).

      The converter output is reduced to ASCII letters, digits and newlines
      (every other byte becomes a space) and cut to at most 250000 characters.

      Whatever was in the active output buffer when the method was called is
      put back into a fresh buffer before returning, on every return path.

      \param $sFileName path of the file to convert
      \return the extracted text; an empty string when the file is missing or
              the converter could not be started; \c false when the file
              exists but its extension is not one of the four handled types.
     */
     function &parseFile( $sFileName )
     {

          //The number below is the maximum number of characters that we will
          //allow ezpublish to attempt to index per document
          $iCharacterLimit = 250000;

          // Save the buffer contents so they can be restored once parsing is
          // done. (Plain assignment: a function result cannot be bound by
          // reference.)
          $sBuffer = ob_get_contents();

          ob_end_clean();
          ob_start();

          // Always a real variable, because this method returns by reference.
          $sData = "";
          $sExtension = strtolower(substr($sFileName,-3,3));

          if(file_exists($sFileName))
          {

               $this->customLog("filename: " . $sFileName . "\n");

               // The name ends up on a shell command line, so it has to be
               // quoted: spaces or metacharacters would otherwise break (or
               // hijack) the command.
               $sFileArgument = escapeshellarg($sFileName);

               switch($sExtension)
               {
                    case "pdf":
                         $sCommand = "pdftotext -nopgbrk  -enc UTF-8 " . $sFileArgument . " -";
                    break;
                    case "doc":
                         $sCommand = "catdoc " . $sFileArgument;
                    break;
                    case "xls":
                         $sCommand = "xls2csv -c -q0 " . $sFileArgument;
                    break;
                    case "ppt":
                         $sCommand = "catppt " . $sFileArgument;
                    break;
                    default:
                         $sCommand = false;
                    break;
               }

               if($sCommand === false)
               {
                    // Do not return from here: the output buffer still has
                    // to be restored at the end of the method.
                    $this->customLog("Invalid File Type\n\n");
                    $sData = false;
               }
               else
               {
                    $sData = $this->runCommand($sCommand);

                    // Replace everything except ASCII letters, digits and
                    // newlines with a space.
                    $sData = preg_replace("([^A-Za-z\d\n])", " ", $sData);
                    if($sData === null)
                    {
                         $sData = "";
                    }

                    // No utf8_encode() step is needed here: after the
                    // replacement above the text is pure ASCII, which is
                    // already valid UTF-8. If the character stripping is ever
                    // removed, non-PDF output must be converted to UTF-8 again.

                    //Trim Data down to acceptable size.
                    $sData = (string) substr($sData, 0, $iCharacterLimit);
               }

          } //if file exists
          else
          {
               $this->customLog("$sFileName was missing...\n");
          }

          ob_end_clean();

          // fill the buffer with the old values
          ob_start();
          print($sBuffer);
          return $sData;

     } //end method parseFile()

     /*!
      Runs the shell command \a $sCommand and returns everything it wrote to
      stdout. Anything written to stderr, and a non-zero exit code, are passed
      to customLog().

      \param $sCommand complete command line, arguments already shell-escaped
      \return the captured stdout, or an empty string when the process could
              not be started.
     */
     function runCommand($sCommand)
     {
          $aSpec = array(
               0 => array("pipe", "r"),  // stdin is a pipe that the child will read from
               1 => array("pipe", "w"),  // stdout is a pipe that the child will write to
               2 => array("pipe", "w")   // stderr is a pipe that the child will write to.
          );
          $aPipes = array();

          $pHandle = proc_open($sCommand, $aSpec, $aPipes);
          if($pHandle === false)
          {
               $this->customLog("Could not start command: " . $sCommand . "\n");
               return "";
          }

          // Nothing is ever sent to the child; closing stdin right away gives
          // it EOF instead of leaving it waiting for input.
          fclose($aPipes[0]);

          // Drain stdout and stderr together. Reading one pipe to EOF before
          // touching the other can block forever once the child fills the
          // pipe that is not being read.
          $aOutput = array(1 => "", 2 => "");
          $aOpen = array(1 => $aPipes[1], 2 => $aPipes[2]);
          while(count($aOpen) > 0)
          {
               $aRead = array_values($aOpen);
               $aWrite = null;
               $aExcept = null;
               if(stream_select($aRead, $aWrite, $aExcept, null) === false)
               {
                    break;
               }

               foreach($aRead as $pStream)
               {
                    $iIndex = ($pStream === $aPipes[1]) ? 1 : 2;
                    $sChunk = fread($pStream, 8192);
                    if($sChunk !== false)
                    {
                         $aOutput[$iIndex] .= $sChunk;
                    }
                    if($sChunk === false || feof($pStream))
                    {
                         unset($aOpen[$iIndex]);
                    }
               }
          }

          fclose($aPipes[1]);
          fclose($aPipes[2]);

          $iExitCode = proc_close($pHandle);

          if($aOutput[2] !== "")
          {
               $this->customLog($aOutput[2]);
          }
          if($iExitCode !== 0)
          {
               $this->customLog("Command exited with code " . $iExitCode . ": " . $sCommand . "\n");
          }

          return $aOutput[1];
     } //end method runCommand()

     /*!
      Appends \a $sData, prefixed with the current date and time, to the log
      file named by the LogFile setting in the BinaryFileHandlerSettings group
      of binaryfile.ini.

      The log file must already exist and be writable; it is not created here.

      \param $sData text to append
      \return \c true when the line was written, \c false otherwise (the
              reason is reported through reportLogFailure()).
     */
     function customLog($sData)
     {
          $oBinaryINI =& eZINI::instance( 'binaryfile.ini' );
          $sLogFile = $oBinaryINI->variable( 'BinaryFileHandlerSettings', 'LogFile' );

          $sData = date("m/d/Y [H:i] ") . " " . $sData;

          // Let's make sure the file exists and is writable first.
          if (!is_writable($sLogFile))
          {
               $this->reportLogFailure("The file $sLogFile is not writable");
               return false;
          }

          // Append mode: the file pointer starts at the end of the file, so
          // existing log lines are kept.
          $pHandle = fopen($sLogFile, 'a');
          if ($pHandle === false)
          {
               $this->reportLogFailure("Cannot open file ($sLogFile)");
               return false;
          }

          $bWritten = (fwrite($pHandle, $sData) !== false);

          // Close the handle on the failure path as well.
          fclose($pHandle);

          if (!$bWritten)
          {
               $this->reportLogFailure("Cannot write to file ($sLogFile)");
               return false;
          }

          return true;
     } //end method customLog()

     /*!
      Reports a problem with the log file itself. The STDERR constant is only
      defined by the command line SAPI, so under a web server the message is
      sent to PHP's error log instead.

      \param $sMessage text describing the failure
     */
     function reportLogFailure($sMessage)
     {
          if (defined('STDERR'))
          {
               fwrite(STDERR, $sMessage);
          }
          else
          {
               error_log($sMessage);
          }
     } //end method reportLogFailure()
} //end class eZBinaryFileParser
?>
36 542 Users on board!

Tutorial menu

Printable

Printer Friendly version of the full article on one page with plain styles

Author(s)