New method for readers (listWorksheetInfo)

Topics: Developer Forum, User Forum
Jun 2, 2011 at 7:46 PM
Edited Jan 3, 2012 at 10:35 PM

I've added the method listWorksheetInfo to the readers (for my own use) that returns the array:

array[<Sheet Index>]['worksheetName']

array[<Sheet Index>]['lastColumnLetter']

array[<Sheet Index>]['lastColumnIndex']

array[<Sheet Index>]['totalRows']

array[<Sheet Index>]['totalColumns']

PHP Version: 5.2.x / PHPExcel Version: 1.7.6 / Windows XP & Debian

This methods gives me the following:

  • Number of worksheets (count(array))
  • Worksheet Index
  • Worksheet Name
  • Last Column Letter
  • Last Column Index
  • Total Rows
  • Total Columns

All this information is returned without actually loading the spreadsheet into memory.

(Note: I created the listWorksheetInfo method by what I could glean from the existing code so there could be a better method to do the same thing)

This allows me to write my read filter code as efficiently as possible.

Customization code (use at your own risk):

 

//
// Reader CSV.php custom method listWorksheetInfo()
//

/**
 * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns)
 *
 * @param   string     $pFilename
 * @throws   Exception
 */
public function listWorksheetInfo($pFilename)
{
  // Check if file exists
  if (!file_exists($pFilename)) {
    throw new Exception("Could not open " . $pFilename . " for reading! File does not exist.");
  }

  // Open file
  $fileHandle = fopen($pFilename, 'r');
  if ($fileHandle === false) {
    throw new Exception("Could not open file " . $pFilename . " for reading.");
  }

  // Skip BOM, if any
  switch ($this->_inputEncoding) {
    case 'UTF-8':
      fgets($fileHandle, 4) == "\xEF\xBB\xBF" ?
        fseek($fileHandle, 3) : fseek($fileHandle, 0);
      break;
    case 'UTF-16LE':
      fgets($fileHandle, 3) == "\xFF\xFE" ?
        fseek($fileHandle, 2) : fseek($fileHandle, 0);
      break;
    case 'UTF-16BE':
      fgets($fileHandle, 3) == "\xFE\xFF" ?
        fseek($fileHandle, 2) : fseek($fileHandle, 0);
      break;
    case 'UTF-32LE':
      fgets($fileHandle, 5) == "\xFF\xFE\x00\x00" ?
        fseek($fileHandle, 4) : fseek($fileHandle, 0);
      break;
    case 'UTF-32BE':
      fgets($fileHandle, 5) == "\x00\x00\xFE\xFF" ?
        fseek($fileHandle, 4) : fseek($fileHandle, 0);
      break;
    default:
      break;
  }

  $escapeEnclosures = array( "\\" . $this->_enclosure, $this->_enclosure . $this->_enclosure );

  $worksheetInfo = array();
  $worksheetInfo[0]['worksheetName'] = 'Worksheet';
  $worksheetInfo[0]['lastColumnLetter'] = 'A';
  $worksheetInfo[0]['lastColumnIndex'] = 0;
  $worksheetInfo[0]['totalRows'] = 0;
  $worksheetInfo[0]['totalColumns'] = 0;

  // Loop through each line of the file in turn
  while (($rowData = fgetcsv($fileHandle, 0, $this->_delimiter, $this->_enclosure)) !== FALSE) {
    $worksheetInfo[0]['totalRows']++;
    $worksheetInfo[0]['lastColumnIndex'] = max($worksheetInfo[0]['lastColumnIndex'], count($rowData) - 1);
  }

  $worksheetInfo[0]['lastColumnLetter'] = PHPExcel_Cell::stringFromColumnIndex($worksheetInfo[0]['lastColumnIndex']);
  $worksheetInfo[0]['totalColumns'] = $worksheetInfo[0]['lastColumnIndex'] + 1;

  // Close file
  fclose($fileHandle);

  return $worksheetInfo;
}

//
// Reader Excel5.php custom method listWorksheetInfo()
//

/**
 * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns)
 *
 * @param   string     $pFilename
 * @throws   Exception
 */
public function listWorksheetInfo($pFilename)
{
  // Check if file exists
  if (!file_exists($pFilename)) {
    throw new Exception("Could not open " . $pFilename . " for reading! File does not exist.");
  }

  $worksheetInfo = array();

  // Read the OLE file
  $this->_loadOLE($pFilename);

  // total byte size of Excel data (workbook global substream + sheet substreams)
  $this->_dataSize = strlen($this->_data);

  // initialize
  $this->_pos            = 0;
  $this->_sheets        = array();

  // Parse Workbook Global Substream
  while ($this->_pos < $this->_dataSize) {
    $code = self::_GetInt2d($this->_data, $this->_pos);

    switch ($code) {
      case self::XLS_Type_BOF:        $this->_readBof();        break;
      case self::XLS_Type_SHEET:      $this->_readSheet();      break;
      case self::XLS_Type_EOF:        $this->_readDefault();    break 2;
      default:                        $this->_readDefault();    break;
    }
  }

  // Parse the individual sheets
  foreach ($this->_sheets as $sheet) {

    if ($sheet['sheetType'] != 0x00) {
      // 0x00: Worksheet
      // 0x02: Chart
      // 0x06: Visual Basic module
      continue;
    }

    $tmpInfo = array();
    $tmpInfo['worksheetName'] = $sheet['name'];
    $tmpInfo['lastColumnLetter'] = 'A';
    $tmpInfo['lastColumnIndex'] = 0;
    $tmpInfo['totalRows'] = 0;
    $tmpInfo['totalColumns'] = 0;

    $this->_pos = $sheet['offset'];

    while ($this->_pos <= $this->_dataSize - 4) {
      $code = self::_GetInt2d($this->_data, $this->_pos);

      switch ($code) {
        case self::XLS_Type_RK:
        case self::XLS_Type_LABELSST:
        case self::XLS_Type_NUMBER:
        case self::XLS_Type_FORMULA:
        case self::XLS_Type_BOOLERR:
        case self::XLS_Type_LABEL:
          $length = self::_GetInt2d($this->_data, $this->_pos + 2);
          $recordData = substr($this->_data, $this->_pos + 4, $length);

          // move stream pointer to next record
          $this->_pos += 4 + $length;

          $rowIndex = self::_GetInt2d($recordData, 0) + 1;
          $columnIndex = self::_GetInt2d($recordData, 2);

          $tmpInfo['totalRows'] = max($tmpInfo['totalRows'], $rowIndex);
          $tmpInfo['lastColumnIndex'] = max($tmpInfo['lastColumnIndex'], $columnIndex);
          break;
        case self::XLS_Type_BOF:      $this->_readBof();          break;
        case self::XLS_Type_EOF:      $this->_readDefault();      break 2;
        default:                      $this->_readDefault();      break;
      }
    }

    $tmpInfo['lastColumnLetter'] = PHPExcel_Cell::stringFromColumnIndex($tmpInfo['lastColumnIndex']);
    $tmpInfo['totalColumns'] = $tmpInfo['lastColumnIndex'] + 1;

    $worksheetInfo[] = $tmpInfo;
  }

  return $worksheetInfo;
}

//
// Reader Excel2003XML.php custom method listWorksheetInfo()
//

/**
 * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns)
 *
 * @param   string     $pFilename
 * @throws   Exception
 */
public function listWorksheetInfo($pFilename)
{
  // Check if file exists
  if (!file_exists($pFilename)) {
    throw new Exception("Could not open " . $pFilename . " for reading! File does not exist.");
  }

  $worksheetInfo = array();

  $xml = simplexml_load_file($pFilename);
  $namespaces = $xml->getNamespaces(true);

  $worksheetID = 1;
  $xml_ss = $xml->children($namespaces['ss']);
  foreach($xml_ss->Worksheet as $worksheet) {
    $worksheet_ss = $worksheet->attributes($namespaces['ss']);

    $tmpInfo = array();
    $tmpInfo['worksheetName'] = '';
    $tmpInfo['lastColumnLetter'] = 'A';
    $tmpInfo['lastColumnIndex'] = 0;
    $tmpInfo['totalRows'] = 0;
    $tmpInfo['totalColumns'] = 0;

    if (isset($worksheet_ss['Name'])) {
      $tmpInfo['worksheetName'] = (string) $worksheet_ss['Name'];
    } else {
      $tmpInfo['worksheetName'] = "Worksheet_{$worksheetID}";
    }

    if (isset($worksheet->Table->Row)) {
      $rowIndex = 0;

      foreach($worksheet->Table->Row as $rowData) {
        $columnIndex = 0;
        $rowHasData = false;

        foreach($rowData->Cell as $cell) {
          if (isset($cell->Data)) {
            $tmpInfo['lastColumnIndex'] = max($tmpInfo['lastColumnIndex'], $columnIndex);
            $rowHasData = true;
          }

          ++$columnIndex;
        }

        ++$rowIndex;

        if ($rowHasData) {
          $tmpInfo['totalRows'] = max($tmpInfo['totalRows'], $rowIndex);
        }
      }
    }

    $tmpInfo['lastColumnLetter'] = PHPExcel_Cell::stringFromColumnIndex($tmpInfo['lastColumnIndex']);
    $tmpInfo['totalColumns'] = $tmpInfo['lastColumnIndex'] + 1;

    $worksheetInfo[] = $tmpInfo;
    ++$worksheetID;
  }

  return $worksheetInfo;
}

//
// Reader Excel2007.php custom method listWorksheetInfo()
//

/**
 * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns)
 *
 * @param   string     $pFilename
 * @throws   Exception
 */
public function listWorksheetInfo($pFilename)
{
  // Check if file exists
  if (!file_exists($pFilename)) {
    throw new Exception("Could not open " . $pFilename . " for reading! File does not exist.");
  }

  $worksheetInfo = array();

  $zip = new ZipArchive;
  $zip->open($pFilename);

  $rels = simplexml_load_string($this->_getFromZipArchive($zip, "_rels/.rels")); //~ http://schemas.openxmlformats.org/package/2006/relationships");
  foreach ($rels->Relationship as $rel) {
    if ($rel["Type"] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument") {
      $dir = dirname($rel["Target"]);
      $relsWorkbook = simplexml_load_string($this->_getFromZipArchive($zip, "$dir/_rels/" . basename($rel["Target"]) . ".rels"));  //~ http://schemas.openxmlformats.org/package/2006/relationships");
      $relsWorkbook->registerXPathNamespace("rel", "http://schemas.openxmlformats.org/package/2006/relationships");

      $worksheets = array();
      foreach ($relsWorkbook->Relationship as $ele) {
        if ($ele["Type"] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet") {
          $worksheets[(string) $ele["Id"]] = $ele["Target"];
        }
      }

      $xmlWorkbook = simplexml_load_string($this->_getFromZipArchive($zip, "{$rel['Target']}"));  //~ http://schemas.openxmlformats.org/spreadsheetml/2006/main");
      if ($xmlWorkbook->sheets) {
        $dir = dirname($rel["Target"]);
        foreach ($xmlWorkbook->sheets->sheet as $eleSheet) {
          $tmpInfo = array();
          $tmpInfo['worksheetName'] = (string) $eleSheet["name"];
          $tmpInfo['lastColumnLetter'] = 'A';
          $tmpInfo['lastColumnIndex'] = 0;
          $tmpInfo['totalRows'] = 0;
          $tmpInfo['totalColumns'] = 0;

          $fileWorksheet = $worksheets[(string) self::array_item($eleSheet->attributes("http://schemas.openxmlformats.org/officeDocument/2006/relationships"), "id")];
          $xmlSheet = simplexml_load_string($this->_getFromZipArchive($zip, "$dir/$fileWorksheet"));  //~ http://schemas.openxmlformats.org/spreadsheetml/2006/main");
          if ($xmlSheet && $xmlSheet->sheetData && $xmlSheet->sheetData->row) {
            foreach ($xmlSheet->sheetData->row as $row) {
              foreach ($row->c as $c) {
                $r = (string) $c["r"];
                $coordinates = PHPExcel_Cell::coordinateFromString($r);

                $rowIndex = $coordinates[1];
                $columnIndex = PHPExcel_Cell::columnIndexFromString($coordinates[0]) - 1;

                $tmpInfo['totalRows'] = max($tmpInfo['totalRows'], $rowIndex);
                $tmpInfo['lastColumnIndex'] = max($tmpInfo['lastColumnIndex'], $columnIndex);
              }
            }
          }

          $tmpInfo['lastColumnLetter'] = PHPExcel_Cell::stringFromColumnIndex($tmpInfo['lastColumnIndex']);
          $tmpInfo['totalColumns'] = $tmpInfo['lastColumnIndex'] + 1;

          $worksheetInfo[] = $tmpInfo;
        }
      }
    }
  }

  $zip->close();

  return $worksheetInfo;
}

//
// Reader Gnumeric.php custom method listWorksheetInfo()
//

/**
 * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns)
 *
 * @param   string     $pFilename
 * @throws   Exception
 */
public function listWorksheetInfo($pFilename)
{
  // Check if file exists
  if (!file_exists($pFilename)) {
    throw new Exception("Could not open " . $pFilename . " for reading! File does not exist.");
  }

  $gFileData = $this->_gzfileGetContents($pFilename);

  $xml = simplexml_load_string($gFileData);
  $namespacesMeta = $xml->getNamespaces(true);

  $gnmXML = $xml->children($namespacesMeta['gnm']);

  $worksheetInfo = array();

  foreach ($gnmXML->Sheets->Sheet as $sheet) {
    $tmpInfo = array();
    $tmpInfo['worksheetName'] = (string) $sheet->Name;
    $tmpInfo['lastColumnLetter'] = 'A';
    $tmpInfo['lastColumnIndex'] = 0;
    $tmpInfo['totalRows'] = 0;
    $tmpInfo['totalColumns'] = 0;

    foreach ($sheet->Cells->Cell as $cell) {
      $cellAttributes = $cell->attributes();

      $rowIndex = (int) $cellAttributes->Row + 1;
      $columnIndex = (int) $cellAttributes->Col;

      $tmpInfo['totalRows'] = max($tmpInfo['totalRows'], $rowIndex);
      $tmpInfo['lastColumnIndex'] = max($tmpInfo['lastColumnIndex'], $columnIndex);
    }

    $tmpInfo['lastColumnLetter'] = PHPExcel_Cell::stringFromColumnIndex($tmpInfo['lastColumnIndex']);
    $tmpInfo['totalColumns'] = $tmpInfo['lastColumnIndex'] + 1;

    $worksheetInfo[] = $tmpInfo;
  }

  return $worksheetInfo;
}

//
// Reader OOCalc.php custom method listWorksheetInfo()
//

/**
 * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns)
 *
 * @param   string     $pFilename
 * @throws   Exception
 */
public function listWorksheetInfo($pFilename)
{
  // Check if file exists
  if (!file_exists($pFilename)) {
    throw new Exception("Could not open " . $pFilename . " for reading! File does not exist.");
  }

  $worksheetInfo = array();

  $zip = new ZipArchive;
  if ($zip->open($pFilename) === true) {

    $xml = simplexml_load_string($zip->getFromName("content.xml"));
    $namespacesContent = $xml->getNamespaces(true);

    $workbook = $xml->children($namespacesContent['office']);
    foreach($workbook->body->spreadsheet as $workbookData) {
      $workbookData = $workbookData->children($namespacesContent['table']);
      foreach($workbookData->table as $worksheetDataSet) {
        $worksheetData = $worksheetDataSet->children($namespacesContent['table']);
        $worksheetDataAttributes = $worksheetDataSet->attributes($namespacesContent['table']);

        $tmpInfo = array();
        $tmpInfo['worksheetName'] = $worksheetDataAttributes['name'];
        $tmpInfo['lastColumnLetter'] = 'A';
        $tmpInfo['lastColumnIndex'] = 0;
        $tmpInfo['totalRows'] = 0;
        $tmpInfo['totalColumns'] = 0;

        $rowIndex = 0;
        foreach ($worksheetData as $key => $rowData) {
          $rowHasData = false;

          switch ($key) {
            case 'table-row' :
              $columnIndex = 0;

              foreach ($rowData as $key => $cellData) {
                $cellHasData = false;

                $cellDataText = $cellData->children($namespacesContent['text']);
                $cellDataOfficeAttributes = $cellData->attributes($namespacesContent['office']);

                if (isset($cellDataText->p)) {
                  switch ($cellDataOfficeAttributes['value-type']) {
                    case 'string' :
                    case 'boolean' :
                    case 'float' :
                    case 'date' :
                    case 'time' :
                        $cellHasData = true;
                        break;
                  }
                }

                $cellDataText = null;
                $cellDataOfficeAttributes = null;

                if ($cellHasData) {
                  $tmpInfo['lastColumnIndex'] = max($tmpInfo['lastColumnIndex'], $columnIndex);
                  $rowHasData = true;
                }

                ++$columnIndex;
              }

              ++$rowIndex;

              if ($rowHasData) {
                $tmpInfo['totalRows'] = max($tmpInfo['totalRows'], $rowIndex);
              }
              break;
          }
        }

        $tmpInfo['lastColumnLetter'] = PHPExcel_Cell::stringFromColumnIndex($tmpInfo['lastColumnIndex']);
        $tmpInfo['totalColumns'] = $tmpInfo['lastColumnIndex'] + 1;

        $worksheetInfo[] = $tmpInfo;
      }
    }
  }

  return $worksheetInfo;
}

//
// Reader SYLK.php custom method listWorksheetInfo()
//

/**
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns)
 *
 * @param   string     $pFilename
 * @throws   Exception
 */
public function listWorksheetInfo($pFilename)
{
  // Check if file exists
  if (!file_exists($pFilename)) {
    throw new Exception("Could not open " . $pFilename . " for reading! File does not exist.");
  }

  // Open file
  $fileHandle = fopen($pFilename, 'r');
  if ($fileHandle === false) {
    throw new Exception("Could not open file " . $pFilename . " for reading.");
  }

$worksheetInfo = array();
$worksheetInfo[0]['worksheetName'] = 'Worksheet';
$worksheetInfo[0]['lastColumnLetter'] = 'A';
$worksheetInfo[0]['lastColumnIndex'] = 0;
$worksheetInfo[0]['totalRows'] = 0;
$worksheetInfo[0]['totalColumns'] = 0;

  // Loop through file
  $rowData = array();

  // loop through one row (line) at a time in the file
  $rowIndex = 0;
  while (($rowData = fgets($fileHandle)) !== FALSE) {
    $columnIndex = 0;

    // convert SYLK encoded $rowData to UTF-8
    $rowData = PHPExcel_Shared_String::SYLKtoUTF8($rowData);

    // explode each row at semicolons while taking into account that literal semicolon (;)
    // is escaped like this (;;)
    $rowData = explode("\t",str_replace('�',';',str_replace(';',"\t",str_replace(';;','�',rtrim($rowData)))));

    $dataType = array_shift($rowData);
    if ($dataType == 'C') {
      //  Read cell value data
      foreach($rowData as $rowDatum) {
        switch($rowDatum{0}) {
          case 'C' :
          case 'X' :
            $columnIndex = substr($rowDatum,1) - 1;
            break;
          case 'R' :
          case 'Y' :
            $rowIndex = substr($rowDatum,1);
            break;
        }

        $worksheetInfo[0]['totalRows'] = max($worksheetInfo[0]['totalRows'], $rowIndex);
        $worksheetInfo[0]['lastColumnIndex'] = max($worksheetInfo[0]['lastColumnIndex'], $columnIndex);
      }
    }
  }

  $worksheetInfo[0]['lastColumnLetter'] = PHPExcel_Cell::stringFromColumnIndex($worksheetInfo[0]['lastColumnIndex']);
$worksheetInfo[0]['totalColumns'] = $worksheetInfo[0]['lastColumnIndex'] + 1;

  // Close file
  fclose($fileHandle);

  return $worksheetInfo;
}

 

 


Read Filter and Spreadsheet Reader sample

 

//
// This class overrides the default reader in order to ignore extraneous data.
//
class ReadFilter implements PHPExcel_Reader_IReadFilter
{
  private $_worksheetName;
  private $_startRow;
  private $_endRow;
  private $_columns;
  private $_isChunkRead;
  private $_chunkStartRow;
  private $_chunkSize;

  public function __construct() {
    $this->_worksheetName = 'Worksheet';
    $this->_startRow      = 0;
    $this->_endRow        = 65530;
    $this->_columns       = $this->getColumnRange('A','XFD');
    $this->_isChunkRead   = false;
    $this->_chunkStartRow = 1;
    $this->_chunkSize     = 1000; // Number of rows read in each chunk
  }

  public function setWorksheetName($worksheetName) {
    $this->_worksheetName = (!empty($worksheetName)) ? $worksheetName : 'Worksheet';
  }

  public function setStartRow($startRow) {
    if(($startRow >= 0) && ($startRow <= $this->_endRow)) {
      $this->_startRow = $startRow;
    }
  }

  public function setEndRow($endRow) {
    if($endRow >= $this->_startRow)
    {
      $this->_endRow = $endRow;
    }
  }

  public function setColumns($columns) {
    $this->_columns = (is_array($columns)) ? $columns  : $this->getColumnRange('A','XFD');
  }

  public function setChunkRead() {
    $this->_isChunkRead = true;
  }

  public function setChunkSize($chunkSize = null) {
    if(!is_null($chunkSize)) {
      $this->_chunkSize = $chunkSize;
    }

    $this->_isChunkRead = true;
  }

  public function setChunkStartRow($chunkStartRow = null) {
    if(!is_null($chunkStartRow)) {
      $this->_chunkStartRow = $chunkStartRow;
    }
  }

  public function unsetChunkRead() {
    $this->_isChunkRead = false;
  }

  public function readCell($column, $row, $worksheetName = '') {
    if((!empty($worksheetName)) && ($worksheetName != $this->_worksheetName)) {
      return false;
    }

    if($this->_isChunkRead) {
      //  Read rows in chunks and only the columns specified
      if(($row >= $this->_chunkStartRow) && ($row < ($this->_chunkStartRow + $this->_chunkSize)) && ($row <= $this->_endRow)) {
        if(in_array($column, $this->_columns)) {
          return true;
        }
      }
    } else {
      //  Read all rows but only the columns specified
      if($row >= $this->_startRow && $row <= $this->_endRow) {
        if(in_array($column, $this->_columns)) {
          return true;
        }
      }
    }

    return false;
  }
}

//
// readSpreadsheetData()
//                       $FileName (String): File Name
//
public function readSpreadsheetData($FileName) {
  $this->data = array();

  if (empty($FileName)) {
    throw new Exception("No file specified.");
  }

  if (!file_exists($FileName)) {
    throw new Exception("Could not open " . $FileName . " for reading! File does not exist.");
  }

  try {
    $FileType = PHPExcel_IOFactory::identify($FileName);
    $SpreadsheetReaderObj = PHPExcel_IOFactory::createReader($FileType);
    switch ($FileType) {
      case 'Excel2007':
      case 'Excel2003XML':
      case 'Excel5':
      case 'OOCalc':
      case 'SYLK':
        break;
      case 'CSV':
        $SpreadsheetReaderObj->setDelimiter(',');
        $SpreadsheetReaderObj->setEnclosure('"');
        $SpreadsheetReaderObj->setLineEnding('\r\n');
        $SpreadsheetReaderObj->setInputEncoding('UTF-8');
        break;
    }

    //
    // Get worksheet information.
    //
    $WorksheetListInfo = $SpreadsheetReaderObj->listWorksheetInfo($FileName);

    //
    // Estimate maximum chunk size based on system memory.
    //
    $MemoryLimit = $this->getMemorySizeAsBytes();
    $BufferFactor = 1; 
    $RowMemory = (1024 * 1) * $WorksheetListInfo[0]['totalColumns'];
    $MaxChunkSize = max(intval(($MemoryLimit / $BufferFactor) / $RowMemory), 100);

    $ReadFilter = new ReadFilter();
    $SpreadsheetReaderObj->setReadFilter($ReadFilter);

    foreach ($WorksheetListInfo as $WorksheetIndex => $WorksheetInfo) {
      //
      // Worksheet Id may be used for identifying specific record.
      //
      $WorksheetId = $WorksheetIndex + 1;

      //
      // If chunk size > total rows in sheet reduce chunk size to total rows.
      //
      $ChunkSize = ($MaxChunkSize < $WorksheetInfo['totalRows'])
                 ? $MaxChunkSize
                 : $WorksheetInfo['totalRows'];

      //
      // Configure read filter.
      //
      $ReadFilter->setWorksheetName($WorksheetInfo['worksheetName']);
      $ReadFilter->setColumns($this->getColumnRange('A', $WorksheetInfo['lastColumnLetter']));
      $ReadFilter->setEndRow($WorksheetInfo['totalRows']);
      $ReadFilter->setChunkSize($ChunkSize);

      for ($StartRow = 1; $StartRow <= $WorksheetInfo['totalRows']; $StartRow += $ChunkSize) {
        $ReadFilter->setStartRow($StartRow);

        $SpreadsheetObj = $SpreadsheetReaderObj->load($FileName);
        $SpreadsheetObj->setActiveSheetIndex($WorksheetIndex);

        $WorksheetObj = $SpreadsheetObj->getActiveSheet();
        foreach ($WorksheetObj->getRowIterator() as $RowObj) {
          $RowId = $RowObj->getRowIndex();
          if ($RowId > $ChunkSize) {
            break;
          }

          $SheetRowId = $StartRow + $RowId - 1;
          if ($SheetRowId > $WorksheetInfo['totalRows']) {
            break;
          }

          $RecordId = "{$WorksheetId}-{$SheetRowId}";
          $this->data["{$RecordId}"] = array(); // array() represents column data.
        }
      }
    }
  } catch (Exception $ExceptionObj) {
    //
    // Exception Error Handling
    //
  }
}

//
// Returns array of column letters between ColumnLetter1 and ColumnLetter2 inclusively
//
public function getColumnRange($ColumnLetter1, $ColumnLetter2)
{
  static $ColumnsReference = '';

  if (empty($ColumnsReference)) {
    $ColumnsReference = $this->initializeColumnsReference(3);
  }

  $ColumnRange = array();

  $Length1 = strlen($ColumnLetter1);
  $Length2 = strlen($ColumnLetter2);

  $Pos1 = strpos($ColumnsReference, "|{$ColumnLetter1}|");
  $Pos2 = strpos($ColumnsReference, "|{$ColumnLetter2}|");

  if ($Pos2 >= $Pos1) {
    $StartPos = $Pos1 + 1;
    $EndPos = $Pos2 + $Length2;

    $Length = $EndPos - $StartPos + 1;

    $RangeStr = substr($ColumnsReference, $StartPos, $Length);

    $ColumnRange = explode('|', $RangeStr);
  }

  return $ColumnRange;
}

//
// Generate string that contains column letters for future reference.
// initializeColumnsReference(1) = "|A|B|C|...|X|Y|Z|"
// initializeColumnsReference(2) = "|A|B|C|...|ZX|ZY|ZZ|"
// initializeColumnsReference(3) = "|A|B|C|...|ZZX|ZZY|ZZZ|"
//
private function initializeColumnsReference($Width = 1)
{
  $ColumnsStr = '';
  $ColumnLetter = '';
  $LastColumnLetter = str_repeat('Z', $Width);
  $Tier = '';

  do {
    for ($ChrVal = 65; $ChrVal <= 90; $ChrVal++) {
      $ColumnLetter = $Tier . chr($ChrVal);
      $ColumnsStr .= "|{$ColumnLetter}";
    }

    if (empty($Tier)) {
      $Tier = 'A';
    } else {
      $IsEndTier = false;
      $IsFirst = true;
      $ZCount = 0;
      $NewTier = '';

      foreach (array_reverse(str_split($Tier)) as $TierChr) {
        if (($TierChr == 'Z') && ($IsEndTier || $IsFirst)) {
          $NewTierChr = 'A';
        } elseif ($IsEndTier || $IsFirst) {
          $NewTierChr = chr(ord($TierChr) + 1);
        } else {
          $NewTierChr = $TierChr;
        }

        $IsEndTier = ($TierChr == 'Z') ? true : false;
        $ZCount += ($TierChr == 'Z') ? 1 : 0;

        $NewTier = "{$NewTierChr}{$NewTier}";
        $IsFirst = false;
      }

      $Tier = $NewTier;
      $Tier .= ($ZCount == strlen($Tier)) ?  'A' : '';
    }
  } while ($ColumnLetter != $LastColumnLetter);

  $ColumnsStr .= "|";

  return $ColumnsStr;
}

//
// Convert Memory Size To Bytes
//
private function getMemorySizeAsBytes() {
  $memory_size = ini_get('memory_limit');

  switch (substr($memory_size, -1)) {
    case 'G':
    case 'g':
      return (int) $memory_size * 1073741824;
    case 'M':
    case 'm':
      return (int) $memory_size * 1048576;
    case 'K':
    case 'k':
      return (int) $memory_size * 1024;
  }

  return $memory_size;
}

 

 


Jun 8, 2011 at 2:43 PM

I think this code may be exactly what I need to work with some overly large CSV files (these files used to be XLS but I figured using CSV would probably be less memory intensive)

I'm currently writing some code that reads data from a spreadsheet and inserts this into a database, this all works fine until the spreadsheets get too large

For reference, the original XLS file I'm working with right now was about 85 MB (but it will eventually have to handle files of around 350 MB as well)

It contains 3 worksheets which I saved as CSV files, the first one is rather small, the other two are a little over 20 MB each

Each of the larger CSV files contains about 48,000 rows and 47 columns (larger files will have larger amounts of columns, not rows)

Now the application I'm writing can handle it when I trim the CSV files into their first 10,000 rows, and uses about 300 MB of memory when processing them (both, not each), I've tried to use the approach of loading the files in chunks to conserve memory usage, but it doesn't seem to be helping much

 

Perhaps this code of yours will be exactly what I need, however I can't seem to get it running; I'm afraid I'm still quite new to php and perhaps you could help me by offering some tips on implementing your code

 

The way I understood it is that I basically include all the code you supplied (although I only copied the CSV Reader as it's the only Reader I'd use) and to get it running I call readSpreadsheetData()

I'm probably missing something very simple here so any help you could offer would be very much appreciated

 

-Jeroen

Jun 8, 2011 at 9:15 PM
Edited Dec 13, 2011 at 9:14 PM

@jeroenbakker

Well since this code was derived from my actual working code there are several presumptions that are not mentioned that could cause some hang ups.

1) readSpreadsheetData() is a method defined inside of a class. You need to define that class minimally in order for this method to work.

2) readSpreadsheetData() requires the ReadFilter class to be loaded in order to work.

3) The following line in the readSpreadsheetData() method presumes that PHP server allows the usage of the ini_get() command.

     $MemoryLimit = $this->return_bytes(ini_get('memory_limit'));

 

Since you say you are new to PHP any of the above could easily be overlooked. So here's instructions and fully functional code with just a couple of changes needed.

If you've installed the PHPExcel package somewhere besides the root directory you will need to specify the $ClassesPath.

Copy the following php code and replace <Specify File> to the actual file name you want to load.

This will read the spreadsheet into a simple array[<Record Id>][<Column Data>] and then show this array using the print_r() command;

(Note: I have not actually tested the code below so let me know if you have any issues.)

I've removed the code posted here since I've updated the original post to include it.

Jun 8, 2011 at 9:25 PM

Thanks a lot for responding! (with working code!)

I'm not currently at a computer which has all the files I work with, but I'll give this a go tomorrow, I'm certain the key to getting my code to work is in your post, as I did spot a mistake I made

The mistake being that I did not define readSpreadsheetData() inside of a class

I'm used to working with Java where such a thing would be a given, but I'm new enough to php that I wasn't sure if that was necessary (and I regret to admit that I haven't actually taken the time to get to know php a lot before starting to work with it)

I'm not sure if my php server allows the usage of the ini_get() command, but as I'm running it locally I'll be able to change that if need be

I'll certainly let you know how this worked out, and many thanks again

Jun 8, 2011 at 9:44 PM
Edited Jun 8, 2011 at 9:45 PM

Glad to help.

Also I should note that I made some minor tweaks to the code to make reading the column data a bit easier. And the chunk size formula is deliberately conservative so you might want to experiment in order to get the best performance.

Anyway, now there is a piece of test code that can be used to verify if the spreadsheet is actually being read and will spit out any errors for debugging.

- Christopher Mullins

Jun 9, 2011 at 11:10 PM
Edited Jun 10, 2011 at 8:19 AM

There were actually a few small issues, but I've managed to resolve them all on my own

A few of them (2 or 3) were capitalisation issues, one of them was that there was a distinct lack of the return_bytes() function :P (or maybe I implemented something wrong? In any case I found and put in my own) and... maybe there was something else, but that was the gist of it

I have yet to try it on a large file right now, because it's getting a bit late here and I assume it'll take a while to process

Once I've tried a larger file I'll let you know how it goes

Thanks for all the effort so far!

 

-Jeroen

Edit:

Well, it still ran out of memory (I had allocated 400MB, perhaps I should've gone with more), sometimes it seems reading it in chunks consumes even more memory

As I am working with CSV files of which I know what kind of values it holds (all numbers and a few letters) I have chosen instead to just read and open it as any text file, which is a lot quicker of course

To be honest I am not sure why I stuck with PHPExcel when all I did was read CSV files, I suppose it is because I started out with Excel files

Many thanks for your help and efforts though

Aug 17, 2011 at 5:38 PM

I found your code to be very useful Crhirs, thanks a lot!  One thing I am having problems with is the 'listWorksheetInfo($FileName)' function.  Where should I place this function?  I tried putting it in the IOFiactory.php file but that didn't work.  I also tried creating a class for it and then calling it with an object but that also didn't work.  I am also new to PHP so I am not so I wouldn't be surprised to find I am doing some silly mistake.  Can someone advice?

 

Thanks!

-Rick

Nov 14, 2011 at 8:14 PM

Absolutely great! - I am developing a program that extracts data from manufacturers products-lists, mostly formated in an excel format and had big issues with the command-sequence "select a file -> select a worksheet -> read selected lines of the worksheet" with big files.
With "big files" I mean files with 200 and more MB.
It was horrible that only listing the embedded worksheets took about half an hour to create the selection dropdown list because all data had to be read.

Your code shows that it should "ABSOLUTELY" make it's way into the project. Maybe by defining base classes with your code and deriving the existing classes from them to make it all backwards-compatible.

My big and great THANK YOU for this code and post.

Yours Henri

Nov 17, 2011 at 3:15 PM
Edited Nov 29, 2011 at 10:45 PM

Hi again,

great, I tested the code and expecially the listWorksheetInfo($pFilename) function.

When I tested the function with Excel-files with a lot of columns (Last column GH) i noticed that the array would always only return Z as the last column.

So, for example in the listWorksheetInfo($pFilename) function of the Excel5.php file, a comparison is made :

 

if ($columnString > $tmpInfo['lastColumnLetter']) {
    $tmpInfo['lastColumnLetter'] = $columnString;
}

Because the comparison

 

$columnString > $tmpInfo['lastColumnLetter'] does only work for column-strings with one letters, I added a function...

	/**
	* Return TRUE if the first parameter is "greater" than the second
	*
	* @param   string $compareme
	* @param   string $compareme
	*
	* @return boolean
	*/
	private function compareColumnLetters(&$compareme, &$with) {
		if ( strlen($compareme) == 1 && strlen($with) == 1 && $compareme > $with) {
			return TRUE;
		}
		else if (strlen($compareme) > strlen($with)) {
			return TRUE;
		}

		else if (strlen($compareme) === 2 && strlen($with) === 2) {
			$arrayCompareme = preg_split('//', $compareme, -1, PREG_SPLIT_NO_EMPTY);
			$arrayWith = preg_split('//', $with, -1, PREG_SPLIT_NO_EMPTY);

			$arrayBigger = array();
			for ($i = 0; $i < 2; $i++) {
				if ($arrayCompareme[$i] == $arrayWith[$i]) {
					$arrayBigger[$i] = 'e';
				}
				else if ($arrayCompareme[$i] > $arrayWith[$i]) {
					$arrayBigger[$i] = 'b';
				}
				else {
					$arrayBigger[$i] = 's';
				}
			}

			if ($arrayBigger[0] == 'b') {
				return TRUE;
			}
			else if ($arrayBigger[0] == 'e' && $arrayBigger[1] == 'b') {
				return TRUE;
			}
		}

		return FALSE;
	}
...which has to be added to each reader-override file and replaced the comparison with:
if ($this->compareColumnLetters($columnString, $tmpInfo['lastColumnLetter'])) {
    $tmpInfo['lastColumnLetter'] = $columnString;
}

This at least returns the correct last column-string also for column-strings with two letters.

Great code! This function really almost consumes no memory.
Yours Henri Schomäcker
Nov 23, 2011 at 7:40 PM
Edited Nov 29, 2011 at 10:46 PM

And hello again,

first of all to answer richis question:

Add the listWorksheetInfo($FileName) functions listed by schir1964 to the file named above each function in the Classes/PHPExcel/Reader directory.

If you want to also support spreadsheet files with more columns than from A to Z, read my reply directly above this one.

Because I also need an array of all column-names (A, B, C etc.) I now also added some lines to return the columnLetters in the array provided by listWorksheetInfo:

 

if ($this->compareColumnLetters($columnString, $tmpInfo['lastColumnLetter'])) {
	$tmpInfo['lastColumnLetter'] = $columnString;

	if (array_search($columnString, $tmpInfo['columnLetters']) === FALSE) {
		$tmpInfo['columnLetters'][] = $columnString;
	}
}
 

 

Note that you have to take care of the naming of the variables in every Reader file because their names differ from file to file!

 

And here's a little example how to use the listWorksheetInfo function:

 

if (file_exists($datafile) && is_readable($datafile)) {

	try {
		$FileType = PHPExcel_IOFactory::identify($datafile);

		$SpreadsheetReaderObj = PHPExcel_IOFactory::createReader($FileType);
		switch ($FileType) {
			case 'Excel5':
			case 'Excel2003XML':
			case 'Excel2007':
			case 'OOCalc':
			case 'SYLK':
			break;
			case 'CSV':
			$SpreadsheetReaderObj->setDelimiter(',');
			$SpreadsheetReaderObj->setEnclosure('"');
			$SpreadsheetReaderObj->setLineEnding('\r\n');
			$SpreadsheetReaderObj->setInputEncoding('UTF-8');
			break;
		}

		// Get worksheet information.
		$WorksheetListInfo = $SpreadsheetReaderObj->listWorksheetInfo($datafile);
	}
	catch (Exception $ExceptionObj) {
		echo $ExceptionObj->getMessage();
	}

	print_r($WorksheetListInfo);
}

 

Yours Henri Schomäcker

 

Dec 2, 2011 at 8:24 AM

@byteconcepts

Thanks for the kind words. And for relaying the column issue. The files that I am dealing with might have 8 to 10 columns at most so I would have never have run into the issue.

I'll give your code some thought to see if perhaps there is a more efficient way to do those comparisons. I intend fix the issue with my code even if I may never run into it.

Thanks again Henri!

- Christopher Mullins

Dec 2, 2011 at 9:42 AM
Edited Dec 13, 2011 at 9:24 PM

 

/* Removed obsolete code in favor of MarkBaker's suggestion below */

Here's some code to return the max column letter of two column letters.

- Christopher Mullins

Dec 2, 2011 at 11:50 PM
Edited Dec 13, 2011 at 9:26 PM

I've added the following routine to the CSV.php file and change the listWorksheetInfo method in the initial post to use it for finding the lasColumnLetter.

/* Removed obsolete code in favor of MarkBaker's suggestion below (/

Dec 7, 2011 at 10:49 AM

@schir1964 Sorry, but did you test the code of convertIntToColumnLetter($columnInt) with more than 26 column-letters (A to Z) ?

Instead of AZ it returns B@

Instead of BZ it returns C@

...and so on.

My test-code:

echo "Number of column letters: \n";
$col_letters = convertColumnLetterToInt('GH');
echo $col_letters;

echo "<p>Column letters: <br />\n";
for ($i = 1; $i <= $col_letters; $i++) {
	echo convertIntToColumnLetter($i) . " <br />\n";
}

Coordinator
Dec 7, 2011 at 3:01 PM
byteconcepts wrote:

@schir1964 Sorry, but did you test the code of convertIntToColumnLetter($columnInt) with more than 26 column-letters (A to Z) ?

Use PHPExcel's built in cell address functions for handling this

PHPExcel_Cell::columnIndexFromString()

and

PHPExcel_Cell::stringFromColumnIndex()

then you don't have any expensive looping either

Dec 7, 2011 at 8:12 PM

@MarkBaker

Thanks! I knew there had to be a better way to do this, but I didn't have time to research it thoroughly.

When I get time I'll make the changes for easy reference here.

- Christopher Mullins

Coordinator
Dec 7, 2011 at 11:26 PM

I will look to include this in the 1.7.7 release

Dec 8, 2011 at 6:49 AM
byteconcepts wrote:
...

Because I also need an array of all column-names (A, B, C etc.) I now also added some lines to return the columnLetters in the array provided by listWorksheetInfo:

if ($this->compareColumnLetters($columnString, $tmpInfo['lastColumnLetter'])) {
	$tmpInfo['lastColumnLetter'] = $columnString;

	if (array_search($columnString, $tmpInfo['columnLetters']) === FALSE) {
		$tmpInfo['columnLetters'][] = $columnString;
	}
}

I processed some xls files now using my little extension to the new function which should return the "columnLetters" array, but ATTENTION:

I'm sorry, but it worked in about 95% percent of the processed files but I had to notice that in some files where at the beginning of a worksheet multiple columns in some rows were combined to one, the array columnLetters returned strange and wrong results!

But: lastColumnLetter was always correct!

So I changed my code, throw (in the readers code) away...

if (array_search($columnString, $tmpInfo['columnLetters']) === FALSE) {
	$tmpInfo['columnLetters'][] = $columnString;
}

...and replaced it (in my main code, not in the readers files) with...

foreach ($array_worksheet_list_info AS $ws_idx => $array_ws_metadata) {
	if ( strlen(trim($array_ws_metadata['lastColumnLetter'])) > 0 ) {
		$highest_column_number = PHPExcel_Cell::columnIndexFromString($array_ws_metadata['lastColumnLetter']);

		$array_column_letters = array();
		for ($c = 0; $c < $highest_column_number; $c++) {
			$array_column_letters[] = PHPExcel_Cell::stringFromColumnIndex($c);
		}

		$array_worksheet_list_info[$ws_idx]['columnLetters'] = $array_column_letters;
	}
	else {
		$array_worksheet_list_info[$ws_idx]['columnLetters'] = array();
	}
}

...because I only need it once in my application while the file is uploaded and then it's metadata is explored and saved to DB.

This will always work if lastColumnLetter is correct. - And if you don't like to modify the PHPExcel package you get the array by using the $objPHPExcelWorksheet->getHighestColumn() method.

Many thanks for the hints to PHPExcel_Cell::columnIndexFromString(string) and PHPExcel_Cell::stringFromColumnIndex(int), Mark !!!

Yours Henri

Coordinator
Dec 8, 2011 at 7:48 AM

Possibly another useful hint, to get an array of all column letters between A and $objPHPExcelWorksheet->getHighestColumn(), you can take advantage of PHP's Perl-style character incrementor:

$col = 'A';
$lastCol = $objPHPExcelWorksheet->getHighestColumn();
$columnArray = array($col);
while($col != $lastCol) {
    $columnArray[] = ++$col;
}

Note that this does ignore whether columns are actually used in the worksheet or not, but the column incrementor can be incredibly useful... it's just a pity that the decrementor doesn't work the same way with characters.

Dec 8, 2011 at 8:18 PM

Okay, I've updated my original post with new versions of the methods for each reader using the internal functions for gathering the information.

I'll be testing this code on and off for the next few weeks, but I think the code is pretty solid now.

Thanks again for the help.

- Christopher Mullins

Dec 9, 2011 at 5:42 AM
Edited Dec 12, 2011 at 6:28 PM

@byteconcepts

Here's my attempt at creating a more robust method for returning a range of column letters:

//
// Returns array of column letters between ColumnLetter1 and ColumnLetter2 inclusively
//
public function getColumnRange($ColumnLetter1, $ColumnLetter2)
{
  static $ColumnsReference = '';

  if (empty($ColumnsReference)) {
    $ColumnsReference = $this->initializeColumnsReference(3);
  }

  $ColumnRange = array();

  $Length1 = strlen($ColumnLetter1);
  $Length2 = strlen($ColumnLetter2);

  $Pos1 = strpos($ColumnsReference, "|{$ColumnLetter1}|");
  $Pos2 = strpos($ColumnsReference, "|{$ColumnLetter2}|");

  if ($Pos2 >= $Pos1) {
    $StartPos = $Pos1 + 1;
    $EndPos = $Pos2 + $Length2;

    $Length = $EndPos - $StartPos + 1;

    $RangeStr = substr($ColumnsReference, $StartPos, $Length);

    $ColumnRange = explode('|', $RangeStr);
  }

  return $ColumnRange;
}

//
// Generate string that contains column letters for future reference.
// initializeColumnsReference(1) = "|A|B|C|...|X|Y|Z|"
// initializeColumnsReference(2) = "|A|B|C|...|ZX|ZY|ZZ|"
// initializeColumnsReference(3) = "|A|B|C|...|ZZX|ZZY|ZZZ|"
//
private function initializeColumnsReference($Width = 1)
{
  $ColumnsStr = '';
  $ColumnLetter = '';
  $LastColumnLetter = str_repeat('Z', $Width);
  $Tier = '';

  do {
    for ($ChrVal = 65; $ChrVal <= 90; $ChrVal++) {
      $ColumnLetter = $Tier . chr($ChrVal);
      $ColumnsStr .= "|{$ColumnLetter}";
    }

    if (empty($Tier)) {
      $Tier = 'A';
    } else {
      $IsEndTier = false;
      $IsFirst = true;
      $ZCount = 0;
      $NewTier = '';

      foreach (array_reverse(str_split($Tier)) as $TierChr) {
        if (($TierChr == 'Z') && ($IsEndTier || $IsFirst)) {
          $NewTierChr = 'A';
        } elseif ($IsEndTier || $IsFirst) {
          $NewTierChr = chr(ord($TierChr) + 1);
        } else {
          $NewTierChr = $TierChr;
        }

        $IsEndTier = ($TierChr == 'Z') ? true : false;
        $ZCount += ($TierChr == 'Z') ? 1 : 0;

        $NewTier = "{$NewTierChr}{$NewTier}";
        $IsFirst = false;
      }

      $Tier = $NewTier;
      $Tier .= ($ZCount == strlen($Tier)) ?  'A' : '';
    }
  } while ($ColumnLetter != $LastColumnLetter);

  $ColumnsStr .= "|";

  return $ColumnsStr;
}

The $ColumnsReference variable only gets initialized the first time the method is called.

- Christopher Mullins

Dec 13, 2011 at 9:19 PM
Edited Dec 13, 2011 at 10:17 PM

Just an update.

I've been testing the code shown for the past couple of weeks and what is shown works as intended.

I am now confident that the code shown here is solid.

There may be ways to make this code even more efficient but unfortunately I don't have time to analyze where it could be.

 

- Christopher Mullins