Compare commits

...

6 Commits

Author SHA1 Message Date
Abhinav Sarkar 8daf918fc8 Added line end delimiter and null string support 2014-06-07 17:07:38 +05:30
Larry Ogrodnek f315c1ae4b Merge pull request #13 from kenahoo/patch-1
Tell the user what the default quote/escape/sep characters are
2014-02-18 20:10:23 -08:00
Ken Williams 478a6b1941 Tell the user what the default quote/escape/sep characters are
Since there are various versions of the CSV "standard" floating around - most notably how embedded quotes are escaped.
2014-02-14 17:47:46 -06:00
Larry Ogrodnek bd9c5011ce move hive/hadoop deps to provided 2013-10-30 22:56:07 -07:00
Larry Ogrodnek 3883772cec update readme re: hive versions 2013-10-30 13:05:21 -07:00
Larry Ogrodnek 339658b5f1 build against hadoop 1.0.3 / hive 0.11.0 2013-10-30 12:54:52 -07:00
4 changed files with 151 additions and 143 deletions

175
pom.xml
View File

@ -1,91 +1,96 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.bizo</groupId>
<artifactId>csv-serde</artifactId>
<version>1.1.2-0.8.1</version>
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>0.8.1</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>0.20.2</version>
</dependency>
<dependency>
<groupId>net.sf.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>2.3</version>
</dependency>
<dependency>
<groupId>javax.jdo</groupId>
<artifactId>jdo2-api</artifactId>
<version>2.3-20090302111651</version>
</dependency>
</dependencies>
<properties>
<!-- use UTF-8 for everything -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<build>
<plugins>
<modelVersion>4.0.0</modelVersion>
<groupId>com.bizo</groupId>
<artifactId>csv-serde</artifactId>
<version>1.1.2-0.11.0</version>
<dependencies>
<dependency>
<groupId>net.sf.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>2.3</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>0.11.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>1.0.3</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>javax.jdo</groupId>
<artifactId>jdo2-api</artifactId>
<version>2.3-20090302111651</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
</dependencies>
<properties>
<!-- use UTF-8 for everything -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.6</source>
<target>1.6</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.2.1</version>
<configuration>
<descriptors>
<descriptor>src/main/assembly/jar-with-dependencies.xml</descriptor>
</descriptors>
</configuration>
<executions>
<execution>
<id>assemble-all</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>3.0</version>
<configuration>
<reportPlugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.6</source>
<target>1.6</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.2.1</version>
<configuration>
<descriptors>
<descriptor>src/main/assembly/jar-with-dependencies.xml</descriptor>
</descriptors>
</configuration>
<executions>
<execution>
<id>assemble-all</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>3.0</version>
<groupId>org.codehaus.mojo</groupId>
<artifactId>cobertura-maven-plugin</artifactId>
<version>2.5.1</version>
<configuration>
<reportPlugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>cobertura-maven-plugin</artifactId>
<version>2.5.1</version>
<configuration>
<formats>
<format>html</format>
<format>xml</format>
</formats>
</configuration>
</plugin>
</reportPlugins>
<formats>
<format>html</format>
<format>xml</format>
</formats>
</configuration>
</plugin>
</plugins>
</build>
</project>
</reportPlugins>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -20,6 +20,14 @@ create table my_table(a string, b string, ...)
### Custom formatting
The default separator, quote, and escape characters from the `opencsv` library are:
```
DEFAULT_ESCAPE_CHARACTER \
DEFAULT_QUOTE_CHARACTER "
DEFAULT_SEPARATOR ,
```
You can also specify custom separator, quote, or escape characters.
```
@ -38,11 +46,9 @@ create table my_table(a string, b string, ...)
## Files
The following include opencsv along with the serde, so only the single jar is needed. Different builds are needed depending on your version of Hive.
The following include opencsv along with the serde, so only the single jar is needed. Currently built against Hive 0.11.0, but should be compatible with other Hive versions.
### Hive 0.8.*
* [csv-serde-1.1.2-0.8.1-all.jar](https://drone.io/github.com/ogrodnek/csv-serde/files/target/csv-serde-1.1.2-0.8.1-all.jar)
* [csv-serde-1.1.2-0.11.0-all.jar](https://drone.io/github.com/ogrodnek/csv-serde/files/target/csv-serde-1.1.2-0.11.0-all.jar)
## Building
@ -56,4 +62,4 @@ Run `mvn eclipse:eclipse` to generate `.project` and `.classpath` files for ecli
## License
csv-serde is open source and licensed under the [Apache 2 License](http://www.apache.org/licenses/LICENSE-2.0.html).
csv-serde is open source and licensed under the [Apache 2 License](http://www.apache.org/licenses/LICENSE-2.0.html).

View File

@ -12,10 +12,6 @@
<useProjectArtifact>true</useProjectArtifact>
<unpack>true</unpack>
<scope>runtime</scope>
<includes>
<include>net.sf.opencsv:opencsv</include>
<include>com.bizo:csv-serde</include>
</includes>
</dependencySet>
</dependencySets>
</assembly>

View File

@ -1,15 +1,7 @@
package com.bizo.hive.serde.csv;
import java.io.CharArrayReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.CSVWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.SerDe;
@ -26,68 +18,74 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.CSVWriter;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
/**
* CSVSerde uses opencsv (http://opencsv.sourceforge.net/) to serialize/deserialize columns as CSV.
*
*
* @author Larry Ogrodnek <ogrodnek@gmail.com>
*/
public final class CSVSerde implements SerDe {
private ObjectInspector inspector;
private String[] outputFields;
private int numCols;
private List<String> row;
private char separatorChar;
private char quoteChar;
private char escapeChar;
private String lineEnd;
private String nullDefinedAs;
/**
 * Prepares this serde for a table: every column is exposed through a Java
 * string inspector, the reusable row/output buffers are sized to the column
 * count, and the CSV formatting options are read from the table properties.
 *
 * @param conf Hadoop configuration; not consulted by this method
 * @param tbl table properties supplying the column name/type lists and the
 *        optional separatorChar, quoteChar, escapeChar, lineEnd and
 *        nullDefinedAs overrides
 * @throws SerDeException declared by the signature; nothing in this body
 *         throws it directly
 */
@Override
public void initialize(final Configuration conf, final Properties tbl) throws SerDeException {
  final String columnNameList = tbl.getProperty(Constants.LIST_COLUMNS);
  final List<String> columnNames = Arrays.asList(columnNameList.split(","));
  // The declared types are parsed but not used afterwards: all columns are
  // handled as strings regardless of what the table declares.
  final List<TypeInfo> columnTypes =
      TypeInfoUtils.getTypeInfosFromTypeString(tbl.getProperty(Constants.LIST_COLUMN_TYPES));

  this.numCols = columnNames.size();

  // One string inspector per column.
  final List<ObjectInspector> stringInspectors = new ArrayList<ObjectInspector>(this.numCols);
  while (stringInspectors.size() < this.numCols) {
    stringInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
  }
  this.inspector =
      ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, stringInspectors);

  // Buffers reused across calls: one slot per column.
  this.outputFields = new String[this.numCols];
  this.row = new ArrayList<String>(this.numCols);
  while (this.row.size() < this.numCols) {
    this.row.add(null);
  }

  // Formatting options, each falling back to a default when the property is absent.
  this.separatorChar = getProperty(tbl, "separatorChar", CSVWriter.DEFAULT_SEPARATOR);
  this.quoteChar = getProperty(tbl, "quoteChar", CSVWriter.DEFAULT_QUOTE_CHARACTER);
  this.escapeChar = getProperty(tbl, "escapeChar", CSVWriter.DEFAULT_ESCAPE_CHARACTER);
  // NOTE(review): if CSVWriter.DEFAULT_LINE_END is non-empty, serialized rows carry a
  // trailing terminator by default - confirm the output format does not add its own.
  this.lineEnd = tbl.getProperty("lineEnd", CSVWriter.DEFAULT_LINE_END);
  this.nullDefinedAs = tbl.getProperty("nullDefinedAs", "\\N");
}
/**
 * Reads a single-character option from the table properties.
 *
 * @param tbl table properties to look the option up in
 * @param property name of the property
 * @param def value to use when the property is absent or empty
 * @return the first character of the property value, or {@code def} when the
 *         property is not set or is the empty string
 */
private static char getProperty(final Properties tbl, final String property, final char def) {
  final String val = tbl.getProperty(property);
  // An empty value has no first character; fall back to the default instead of
  // letting charAt(0) throw StringIndexOutOfBoundsException.
  if (val == null || val.isEmpty()) {
    return def;
  }
  return val.charAt(0);
}
@Override
public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
final StructObjectInspector outputRowOI = (StructObjectInspector) objInspector;
final List<? extends StructField> outputFieldRefs = outputRowOI.getAllStructFieldRefs();
if (outputFieldRefs.size() != numCols) {
throw new SerDeException("Cannot serialize the object because there are "
+ outputFieldRefs.size() + " fields but the table has " + numCols + " columns.");
@ -97,45 +95,48 @@ public final class CSVSerde implements SerDe {
for (int c = 0; c < numCols; c++) {
final Object field = outputRowOI.getStructFieldData(obj, outputFieldRefs.get(c));
final ObjectInspector fieldOI = outputFieldRefs.get(c).getFieldObjectInspector();
// The data must be of type String
final StringObjectInspector fieldStringOI = (StringObjectInspector) fieldOI;
// Convert the field to Java class String, because objects of String type
// can be stored in String, Text, or some other classes.
outputFields[c] = fieldStringOI.getPrimitiveJavaObject(field);
if (outputFields[c] == null) {
outputFields[c] = nullDefinedAs;
}
}
final StringWriter writer = new StringWriter();
final CSVWriter csv = newWriter(writer, separatorChar, quoteChar, escapeChar);
final CSVWriter csv = newWriter(writer, separatorChar, quoteChar, escapeChar, lineEnd);
try {
csv.writeNext(outputFields);
csv.close();
return new Text(writer.toString());
} catch (final IOException ioe) {
throw new SerDeException(ioe);
}
}
}
@Override
public Object deserialize(final Writable blob) throws SerDeException {
Text rowText = (Text) blob;
CSVReader csv = null;
try {
csv = newReader(new CharArrayReader(rowText.toString().toCharArray()), separatorChar, quoteChar, escapeChar);
csv = newReader(new CharArrayReader(rowText.toString().toCharArray()), separatorChar, quoteChar, escapeChar);
final String[] read = csv.readNext();
// Copy the parsed fields into the reusable row; columns beyond the end of the
// parsed line (or every column, when nothing was read) become null.
for (int i = 0; i < numCols; i++) {
  if (read != null && i < read.length) {
    // Compare by value, not by reference: a string parsed from the input is a
    // different object from the configured marker, so `==` would not match
    // and the null marker would come through as literal text.
    row.set(i, nullDefinedAs.equals(read[i]) ? null : read[i]);
  } else {
    row.set(i, null);
  }
}
return row;
} catch (final Exception e) {
throw new SerDeException(e);
@ -149,22 +150,22 @@ public final class CSVSerde implements SerDe {
}
}
}
/**
 * Builds a CSVReader over {@code reader} using the given separator and quote
 * characters. When {@code escape} equals CSVWriter.DEFAULT_ESCAPE_CHARACTER the
 * constructor overload without an escape argument is used; otherwise
 * {@code escape} is passed through explicitly.
 *
 * NOTE(review): this span carries both the pre-change (instance) and
 * post-change (static) signature lines, and a duplicated return line, from the
 * diff view - presumably only the static variant exists in the current file;
 * confirm against the repository.
 */
private CSVReader newReader(final Reader reader, char separator, char quote, char escape) {
// CSVReader will throw an exception if any of separator, quote, or escape is the same, but
private static CSVReader newReader(final Reader reader, char separator, char quote, char escape) {
// CSVReader will throw an exception if any of separator, quote, or escape is the same, but
// the CSV format specifies that the escape character and quote char are the same... very weird
if (CSVWriter.DEFAULT_ESCAPE_CHARACTER == escape) {
// Escape left at the writer default: use the overload that takes no escape argument.
return new CSVReader(reader, separator, quote);
} else {
return new CSVReader(reader, separator, quote, escape);
return new CSVReader(reader, separator, quote, escape);
}
}
/**
 * Builds a CSVWriter over {@code writer} using the given separator and quote
 * characters. When {@code escape} equals CSVWriter.DEFAULT_ESCAPE_CHARACTER the
 * constructor overload without an escape argument is used; otherwise
 * {@code escape} is passed through explicitly. The final constructor argument
 * is the line terminator: the empty string in the pre-change lines,
 * {@code lineEnd} in the post-change lines.
 *
 * NOTE(review): this span carries both the pre-change (four-argument, instance)
 * and post-change (five-argument, static) versions from the diff view -
 * presumably only the static variant exists in the current file; confirm
 * against the repository.
 */
private CSVWriter newWriter(final Writer writer, char separator, char quote, char escape) {
private static CSVWriter newWriter(final Writer writer, char separator, char quote, char escape, String lineEnd) {
if (CSVWriter.DEFAULT_ESCAPE_CHARACTER == escape) {
return new CSVWriter(writer, separator, quote, "");
return new CSVWriter(writer, separator, quote, lineEnd);
} else {
return new CSVWriter(writer, separator, quote, escape, "");
return new CSVWriter(writer, separator, quote, escape, lineEnd);
}
}
@ -177,7 +178,7 @@ public final class CSVSerde implements SerDe {
/**
 * Reports the Writable type of serialized rows.
 *
 * @return {@code Text.class}, the type that serialize() returns
 */
public Class<? extends Writable> getSerializedClass() {
return Text.class;
}
/**
 * Statistics accessor; this serde does not collect statistics.
 *
 * @return always {@code null}
 */
public SerDeStats getSerDeStats() {
return null;
}