ParserCSV.inc

  1. nittany7 modules/contrib/feeds/libraries/ParserCSV.inc
  2. cis7 modules/contrib/feeds/libraries/ParserCSV.inc
  3. mooc7 modules/contrib/feeds/libraries/ParserCSV.inc

Contains CSV Parser.

Functions in this file are independent of the Feeds specific implementation. Thanks to jpetso http://drupal.org/user/56020 for most of the code in this file.

Classes

Name | Description
ParserCSV Functionality to parse CSV files into a two dimensional array.
ParserCSVIterator Text lines from file iterator.

File

modules/contrib/feeds/libraries/ParserCSV.inc
View source
  1. <?php
  2. /**
  3. * @file
  4. * Contains CSV Parser.
  5. *
  6. * Functions in this file are independent of the Feeds specific implementation.
  7. * Thanks to jpetso http://drupal.org/user/56020 for most of the code in this
  8. * file.
  9. */
  /**
   * Text lines from file iterator.
   *
   * Reads a file one line at a time with fgets() and records the byte offset
   * reached after every read, so that a consumer can later resume reading from
   * that offset by passing it to rewind().
   *
   * The #[\ReturnTypeWillChange] markers below are ordinary comments on PHP
   * versions before 8.0; on 8.1+ they silence the return type deprecation
   * notices for the Iterator methods without changing any signature.
   */
  class ParserCSVIterator implements Iterator {
    // File handle returned by fopen(), or FALSE when the file could not be
    // opened.
    private $handle;
    // The most recently read line (including its line ending), or NULL when
    // nothing has been read yet or the end of the file has been reached.
    private $currentLine;
    // Byte offset of the handle right after the most recent read.
    private $currentPos;

    /**
     * Opens the given file for reading.
     *
     * @param $filepath
     *   Path of the file to read, passed to fopen() unchanged.
     */
    public function __construct($filepath) {
      $this->handle = fopen($filepath, 'r');
      $this->currentLine = NULL;
      $this->currentPos = NULL;
    }

    /**
     * Closes the file handle if one was opened.
     */
    public function __destruct() {
      if ($this->handle) {
        fclose($this->handle);
      }
    }

    /**
     * Moves to the given byte offset and reads the line that starts there.
     *
     * @param $pos
     *   Byte offset to seek to; defaults to the beginning of the file.
     */
    #[\ReturnTypeWillChange]
    public function rewind($pos = 0) {
      if ($this->handle) {
        fseek($this->handle, $pos);
        $this->next();
      }
    }

    /**
     * Reads the next line from the file.
     *
     * @return
     *   The line that was read, or NULL when there are no more lines (or no
     *   file handle is available).
     */
    #[\ReturnTypeWillChange]
    public function next() {
      if ($this->handle) {
        // fgets() returns FALSE once no data is left, which can happen before
        // feof() reports TRUE. Normalize that to NULL so that valid(), which
        // relies on isset(), does not report a phantom line at the end.
        $line = feof($this->handle) ? FALSE : fgets($this->handle);
        $this->currentLine = ($line === FALSE) ? NULL : $line;
        $this->currentPos = ftell($this->handle);
        return $this->currentLine;
      }
    }

    /**
     * Tells whether a line is currently available.
     */
    #[\ReturnTypeWillChange]
    public function valid() {
      return isset($this->currentLine);
    }

    /**
     * Returns the current line, or NULL if there is none.
     */
    #[\ReturnTypeWillChange]
    public function current() {
      return $this->currentLine;
    }

    /**
     * Returns the byte offset reached after reading the current line.
     */
    public function currentPos() {
      return $this->currentPos;
    }

    /**
     * Returns the iteration key, which is the constant string 'line'.
     */
    #[\ReturnTypeWillChange]
    public function key() {
      return 'line';
    }
  }
  /**
   * Functionality to parse CSV files into a two dimensional array.
   *
   * Configure the parser through the set*() methods, then hand parse() a line
   * iterator. Parsing can be bounded by a line limit or a timeout and resumed
   * later from the byte position reported by lastLinePos().
   */
  class ParserCSV {
    private $delimiter;
    private $skipFirstLine;
    private $columnNames;
    private $timeout;
    private $timeoutReached;
    private $startByte;
    private $lineLimit;
    private $lastLinePos;

    /**
     * Initializes the parser with its defaults: comma delimiter, no skipped
     * first line, no column names, no timeout, start at byte 0, no line limit.
     */
    public function __construct() {
      $this->delimiter = ',';
      $this->skipFirstLine = FALSE;
      $this->columnNames = FALSE;
      $this->timeout = FALSE;
      $this->timeoutReached = FALSE;
      $this->startByte = 0;
      $this->lineLimit = 0;
      $this->lastLinePos = 0;
      // Process-wide setting so that line based reads also recognize files
      // that use a lone carriage return as line ending.
      ini_set('auto_detect_line_endings', TRUE);
    }

    /**
     * Set the column delimiter string.
     * By default, the comma (',') is used as delimiter.
     */
    public function setDelimiter($delimiter) {
      $this->delimiter = $delimiter;
    }

    /**
     * Set this to TRUE if the parser should skip the first line of the CSV text,
     * which might be desired if the first line contains the column names.
     * By default, this is set to FALSE and the first line is not skipped.
     */
    public function setSkipFirstLine($skipFirstLine) {
      $this->skipFirstLine = $skipFirstLine;
    }

    /**
     * Specify an array of column names if you know them in advance, or FALSE
     * (which is the default) to unset any prior column names. If no column names
     * are set, the parser will put each row into a simple numerically indexed
     * array. If column names are given, the parser will create arrays with
     * these column names as array keys instead.
     */
    public function setColumnNames($columnNames) {
      $this->columnNames = $columnNames;
    }

    /**
     * Define the time (in milliseconds) after which the parser stops parsing,
     * even if it has not yet finished processing the CSV data. If the timeout
     * has been reached before parsing is done, the parse() method will return
     * an incomplete list of rows - a single row will never be cut off in the
     * middle, though. By default, no timeout (@p $timeout == FALSE) is defined.
     *
     * You can check if the timeout has been reached by calling the
     * timeoutReached() method after parse() has been called.
     */
    public function setTimeout($timeout) {
      $this->timeout = $timeout;
    }

    /**
     * After calling the parse() method, determine if the timeout (set by the
     * setTimeout() method) has been reached.
     *
     * @deprecated Use lastLinePos() instead to determine whether a file has
     *   finished parsing.
     */
    public function timeoutReached() {
      return $this->timeoutReached;
    }

    /**
     * Define the number of lines to parse in one parsing operation.
     *
     * By default, all lines of a file are being parsed.
     */
    public function setLineLimit($lines) {
      $this->lineLimit = $lines;
    }

    /**
     * Get the byte number where the parser left off after last parse() call.
     *
     * @return
     *   0 if all lines or no line has been parsed, the byte position of where a
     *   timeout or the line limit has been reached otherwise. This position can be
     *   used to set the start byte for the next iteration after parse() has
     *   reached the timeout set with setTimeout() or the line limit set with
     *   setLineLimit().
     *
     * @see ParserCSV::setStartByte()
     */
    public function lastLinePos() {
      return $this->lastLinePos;
    }

    /**
     * Set the byte where file should be started to read.
     *
     * Useful when parsing a file in batches.
     *
     * @return
     *   The start byte that was just set.
     */
    public function setStartByte($start) {
      return $this->startByte = $start;
    }

    /**
     * Parse CSV files into a two dimensional array.
     *
     * Reading starts at the byte set with setStartByte() and stops at the end
     * of the input, or earlier when the line limit set with setLineLimit() or
     * the timeout set with setTimeout() is reached.
     *
     * @param Iterator $lineIterator
     *   An Iterator object that yields line strings, e.g. ParserCSVIterator.
     *   Its rewind() is called with the start byte, and its currentPos() is
     *   called when parsing stops early.
     *
     * @return
     *   Two dimensional array that contains the data in the CSV file. Rows are
     *   numerically indexed unless column names were set, in which case each
     *   row is keyed by column name and missing fields become ''.
     */
    public function parse(Iterator $lineIterator) {
      $skipLine = $this->skipFirstLine;
      $rows = array();
      $this->timeoutReached = FALSE;
      $this->lastLinePos = 0;
      // microtime(TRUE) returns seconds as a float; the timeout is documented in
      // milliseconds, so the deadline is kept in milliseconds as well.
      $maxTime = empty($this->timeout) ? FALSE : (microtime(TRUE) * 1000 + $this->timeout);
      $linesParsed = 0;
      $delimiterLength = strlen($this->delimiter);

      for ($lineIterator->rewind($this->startByte); $lineIterator->valid(); $lineIterator->next()) {
        // Make really sure we've got lines without trailing newlines.
        $line = trim((string) $lineIterator->current(), "\r\n");

        // Skip empty lines. A strict comparison is required here: empty() would
        // also discard a row that consists of the single character "0".
        if ($line === '') {
          continue;
        }
        // If the first line contains column names, skip it.
        if ($skipLine) {
          $skipLine = FALSE;
          continue;
        }

        // The actual parser. explode() is unfortunately not suitable because the
        // delimiter might be located inside a quoted field, and that would break
        // the field and/or require additional effort to re-join the fields.
        $quoted = FALSE;
        $currentIndex = 0;
        $currentField = '';
        $fields = array();

        // We must use strlen() as we're parsing byte by byte using strpos(), so
        // drupal_strlen() will not work properly. The bound is inclusive so that
        // a line ending in a delimiter or closing quote still gets its last
        // (possibly empty) field recorded.
        while ($currentIndex <= strlen($line)) {
          if ($quoted) {
            $nextQuoteIndex = strpos($line, '"', $currentIndex);

            if ($nextQuoteIndex === FALSE) {
              // There's a line break before the quote is closed, so fetch the
              // next line and start from there.
              $currentField .= substr($line, $currentIndex);
              $lineIterator->next();

              if (!$lineIterator->valid()) {
                // Unclosed quote at the end of the input: record what we have.
                $fields[] = $currentField;
                break;
              }
              // Ok, so, on with fetching the next line, as mentioned above.
              $currentField .= "\n";
              $line = trim((string) $lineIterator->current(), "\r\n");
              $currentIndex = 0;
              continue;
            }

            // There's actually another quote in this line...
            // find out whether it's escaped or not.
            $currentField .= substr($line, $currentIndex, $nextQuoteIndex - $currentIndex);

            if (isset($line[$nextQuoteIndex + 1]) && $line[$nextQuoteIndex + 1] === '"') {
              // Escaped quote, add a single one to the field and proceed quoted.
              $currentField .= '"';
              $currentIndex = $nextQuoteIndex + 2;
            }
            else {
              // End of the quoted section, close the quote and let the
              // $quoted == FALSE block finalize the field.
              $quoted = FALSE;
              $currentIndex = $nextQuoteIndex + 1;
            }
          }
          else { // $quoted == FALSE
            // First, let's find out where the next character of interest is.
            $nextQuoteIndex = strpos($line, '"', $currentIndex);
            $nextDelimiterIndex = strpos($line, $this->delimiter, $currentIndex);

            if ($nextQuoteIndex === FALSE) {
              $nextIndex = $nextDelimiterIndex;
            }
            elseif ($nextDelimiterIndex === FALSE) {
              $nextIndex = $nextQuoteIndex;
            }
            else {
              $nextIndex = min($nextQuoteIndex, $nextDelimiterIndex);
            }

            if ($nextIndex === FALSE) {
              // This line is done, add the rest of it as last field.
              $currentField .= substr($line, $currentIndex);
              $fields[] = $currentField;
              break;
            }
            elseif ($nextIndex === $nextDelimiterIndex) {
              // The field ends right before the delimiter; none of the
              // delimiter's bytes belong to it, however long the delimiter is.
              $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex);
              $fields[] = $currentField;
              $currentField = '';
              // Continue with the next field, right behind the delimiter.
              $currentIndex = $nextIndex + $delimiterLength;
            }
            else { // $line[$nextIndex] == '"'
              $quoted = TRUE;
              $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex);
              $currentIndex = $nextIndex + 1;
              // Continue this field in the $quoted == TRUE block.
            }
          }
        }
        // End of CSV parser. We've now got all the fields of the line as strings
        // in the $fields array.
        if (empty($this->columnNames)) {
          $row = $fields;
        }
        else {
          // Map fields to column names by position; columns without a matching
          // field get an empty string.
          $row = array();
          foreach (array_values($this->columnNames) as $position => $columnName) {
            $row[$columnName] = isset($fields[$position]) ? $fields[$position] : '';
          }
        }
        $rows[] = $row;

        // Quit parsing if timeout has been reached or requested lines have been
        // reached.
        if ($maxTime !== FALSE && microtime(TRUE) * 1000 > $maxTime) {
          $this->timeoutReached = TRUE;
          $this->lastLinePos = $lineIterator->currentPos();
          break;
        }
        $linesParsed++;
        if ($this->lineLimit && $linesParsed >= $this->lineLimit) {
          $this->lastLinePos = $lineIterator->currentPos();
          break;
        }
      }
      return $rows;
    }
  }