41 |
rodolico |
1 |
<?php
|
|
|
2 |
|
|
|
3 |
/**
 * Reads rows from a delimited text (CSV) file.
 *
 * The file is opened once in the constructor and rows are then pulled with
 * get(), either all at once or in batches. When no delimiter is supplied the
 * start of the file is sampled by analyse_file() to guess one.
 */
class CsvImporter
{
    /** @var resource|false|null Handle of the file being imported. */
    private $fp;

    /** @var bool When true, the first row is treated as column names. */
    private $parse_header;

    /** @var array|null Column names from the first row (set only when $parse_header is true). */
    private $header;

    /** @var string Field delimiter handed to fgetcsv(). */
    private $delimiter;

    /** @var int Maximum line length handed to fgetcsv(). */
    private $length;

    /** @var array|null Result of analyse_file() when the delimiter was auto-detected. */
    private $analysis;

    /** @var int Running total of data rows returned by get(). */
    public $lines;

    //--------------------------------------------------------------------
    /**
     * Opens the file and, if requested, consumes the header row.
     *
     * @param string $file_name    Path of the file to import.
     * @param bool   $parse_header True to use the first row as keys for every returned row.
     * @param string $delimiter    Field delimiter; '' means auto-detect via analyse_file().
     * @param int    $length       Maximum line length passed to fgetcsv().
     *
     * @throws RuntimeException When the file cannot be opened.
     */
    public function __construct($file_name, $parse_header=false, $delimiter='', $length=8000)
    {
        // Ask PHP's line reader to recognise lone-CR (classic Mac) line endings.
        // NOTE: this ini setting is deprecated as of PHP 8.1.
        ini_set('auto_detect_line_endings', TRUE);

        $this->parse_header = $parse_header;
        $this->length = $length;
        $this->lines = 0;

        if ($delimiter === '') {
            // No delimiter given: guess one from a sample of the file.
            $this->analysis = $this->analyse_file($file_name);
            $this->delimiter = $this->analysis['delimiter']['value'];
        } else {
            $this->delimiter = $delimiter;
        }

        $this->fp = fopen($file_name, 'r');
        if ($this->fp === false) {
            // Fail here rather than let every later fgetcsv() call operate on
            // an invalid handle.
            throw new RuntimeException('Unable to open CSV file: ' . $file_name);
        }

        if ($this->parse_header) {
            $header = $this->read_row();
            // An empty file yields false; normalise so get() can always iterate.
            $this->header = is_array($header) ? $header : array();
        }
    }

    //--------------------------------------------------------------------
    /**
     * Closes the file handle if one was successfully opened.
     */
    public function __destruct()
    {
        if (is_resource($this->fp)) {
            fclose($this->fp);
        }
    }

    //--------------------------------------------------------------------
    /**
     * Reads the next batch of rows from the file.
     *
     * Each row is a numerically indexed array, or, when the importer was
     * created with $parse_header, an array keyed by the header names. Columns
     * missing from a short row are returned as null; columns beyond the header
     * are dropped. The public $lines counter is increased by the number of
     * rows returned.
     *
     * @param int $max_lines Maximum number of rows to read; 0 reads every remaining row.
     *
     * @return array List of rows (empty once the end of the file is reached).
     */
    public function get($max_lines=0)
    {
        $data = array();
        $unlimited = ($max_lines == 0);

        // The limit is tested first so that no row is consumed (and lost)
        // once the batch is already full.
        while (($unlimited || count($data) < $max_lines)
            && ($row = $this->read_row()) !== false
        ) {
            if ($this->parse_header) {
                $keyed = array();
                foreach ($this->header as $i => $heading_i) {
                    $keyed[$heading_i] = isset($row[$i]) ? $row[$i] : null;
                }
                $data[] = $keyed;
            } else {
                $data[] = $row;
            }
        }

        $this->lines += count($data);
        return $data;
    }

    /**
     * Samples the start of a file and guesses its line ending and delimiter.
     *
     * Based on a user note from http://php.net/manual/en/function.fgetcsv.php
     *
     * Opens $file, reads up to $capture_limit_in_kb kilobytes, and counts the
     * candidate line endings and field delimiters found in that sample. The
     * most frequent candidate wins; ties go to the candidate listed first, and
     * when none occurs at all the fallback is "\n" / comma. Delimiters are
     * counted as plain characters, so occurrences inside quoted fields are
     * counted too.
     *
     * Returns an array with keys
     *   peak_mem    - 'start' and 'end' values of memory_get_peak_usage(true)
     *   read_kb     - the sample limit that was used
     *   line_ending - 'results' (all counts, ascending), 'count', 'key', 'value'
     *   lines       - 'count' of sampled lines and their combined 'length'
     *   delimiter   - 'results' (all counts, ascending), 'count', 'key', 'value'
     *
     * @param string $file                Path of the file to analyse.
     * @param int    $capture_limit_in_kb Size of the sample in kilobytes.
     *
     * @return array Analysis results as described above.
     *
     * @throws RuntimeException When the file cannot be opened.
     */
    public function analyse_file($file, $capture_limit_in_kb = 10)
    {
        $output = array();

        // capture starting memory usage
        $output['peak_mem']['start'] = memory_get_peak_usage(true);

        // log how much of the file may be sampled (in KB)
        $output['read_kb'] = $capture_limit_in_kb;

        // read in the sample
        $fh = fopen($file, 'r');
        if ($fh === false) {
            throw new RuntimeException('Unable to open CSV file for analysis: ' . $file);
        }
        $sample_bytes = (int) ($capture_limit_in_kb * 1024);
        $contents = ($sample_bytes > 0) ? fread($fh, $sample_bytes) : '';
        fclose($fh);
        if ($contents === false) {
            $contents = '';
        }

        // allowed field delimiters, in order of preference for ties
        $delimiters = array(
            'comma'     => ',',
            'semicolon' => ';',
            'tab'       => "\t",
            'pipe'      => '|',
            'colon'     => ':'
        );

        // allowed line endings; two-character sequences come first so that
        // they are consumed before their single-character parts are counted
        $line_endings = array(
            'rn' => "\r\n",
            'nr' => "\n\r",
            'n'  => "\n",
            'r'  => "\r"
        );

        // count each line ending without overlap: once counted, a sequence is
        // blanked out so that e.g. "\r\n" is not counted again as "\n" and "\r"
        $line_result = array();
        $remaining = $contents;
        foreach ($line_endings as $key => $value) {
            $line_result[$key] = substr_count($remaining, $value);
            $remaining = str_replace($value, ' ', $remaining);
        }

        $line_key = $this->pick_most_frequent($line_result, 'n');
        $line_count = $line_result[$line_key];

        // sort by ascending count for the report
        asort($line_result);

        // log to output array
        $output['line_ending']['results'] = $line_result;
        $output['line_ending']['count'] = $line_count;
        $output['line_ending']['key'] = $line_key;
        $output['line_ending']['value'] = $line_endings[$line_key];
        $lines = explode($output['line_ending']['value'], $contents);

        // Drop the final fragment when it is just the empty string after a
        // trailing line ending, or when the sample filled the whole capture
        // window and the fragment may be cut mid-line. A lone fragment is kept
        // so that single-line files can still be analysed.
        $last_line = end($lines);
        $truncated = ($sample_bytes > 0 && strlen($contents) >= $sample_bytes);
        if ($last_line === '' || ($truncated && count($lines) > 1)) {
            array_pop($lines);
        }

        // create a string from the retained lines
        $complete_lines = implode(' ', $lines);

        // log statistics to output array
        $output['lines']['count'] = count($lines);
        $output['lines']['length'] = strlen($complete_lines);

        // count each delimiter instance
        $delimiter_result = array();
        foreach ($delimiters as $delimiter_key => $delimiter) {
            $delimiter_result[$delimiter_key] = substr_count($complete_lines, $delimiter);
        }

        $delimiter_key = $this->pick_most_frequent($delimiter_result, 'comma');
        $delimiter_count = $delimiter_result[$delimiter_key];

        // sort by ascending count for the report
        asort($delimiter_result);

        // log statistics to output array with the winning candidate as the value
        $output['delimiter']['results'] = $delimiter_result;
        $output['delimiter']['count'] = $delimiter_count;
        $output['delimiter']['key'] = $delimiter_key;
        $output['delimiter']['value'] = $delimiters[$delimiter_key];

        // capture ending memory usage
        $output['peak_mem']['end'] = memory_get_peak_usage(true);
        return $output;
    }

    /**
     * Reads one row from the open file with the configured length and delimiter.
     *
     * The enclosure and escape characters are passed explicitly with the
     * values fgetcsv() has traditionally used by default, because newer PHP
     * versions deprecate relying on the implicit escape default.
     *
     * @return array|false The parsed row, or false at end of file.
     */
    private function read_row()
    {
        return fgetcsv($this->fp, $this->length, $this->delimiter, '"', '\\');
    }

    /**
     * Returns the key with the highest count.
     *
     * @param array  $counts   Map of candidate key => occurrence count, in order of preference.
     * @param string $fallback Key to return when no candidate occurs at all.
     *
     * @return string The winning key; on a tie the candidate listed first wins.
     */
    private function pick_most_frequent($counts, $fallback)
    {
        $best_key = $fallback;
        $best_count = 0;
        foreach ($counts as $key => $count) {
            if ($count > $best_count) {
                $best_key = $key;
                $best_count = $count;
            }
        }
        return $best_key;
    }

} // class CsvImporter
|
|
|
167 |
|
|
|
168 |
|
|
|
169 |
?>
|