allow custom separator, escape, quote chars

hive-0.8
Larry Ogrodnek 2011-12-16 13:35:38 -08:00
parent 31c847466e
commit 9ac30d9020
6 changed files with 81 additions and 11 deletions

View File

@ -5,7 +5,7 @@
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="lib/default/hadoop-core.jar"/>
<classpathentry kind="lib" path="lib/default/hive-exec.jar"/>
<classpathentry kind="lib" path="lib/default/opencsv.jar"/>
<classpathentry kind="lib" path="lib/default/opencsv.jar" sourcepath="lib/sources/opencsv-src.jar"/>
<classpathentry kind="lib" path="lib/test/junit.jar"/>
<classpathentry kind="lib" path="lib/buildtime/log4j.jar"/>
<classpathentry kind="lib" path="lib/test/commons-logging.jar"/>

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
lib
target

View File

@ -19,7 +19,7 @@
<dependencies defaultconf="default->default;hadoop->default;sources->sources()">
<!-- default (compile, test, runtime) dependencies -->
<dependency org="opencsv" name="opencsv" rev="2.2" />
<dependency org="opencsv" name="opencsv" rev="2.3" />
<dependency org="apache" name="hive" rev="0.5.0" conf="default->exec" />

View File

@ -4,4 +4,4 @@
# The location of the common build system
common.build.dir=${basedir}/../common-build
project.version=1.1.1
project.version=1.1.2

View File

@ -2,7 +2,9 @@ package com.bizo.hive.serde.csv;
import java.io.CharArrayReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@ -39,10 +41,14 @@ public final class CSVSerde implements SerDe {
private int numCols;
private List<String> row;
private char separatorChar;
private char quoteChar;
private char escapeChar;
@Override
public void initialize(final Configuration conf, final Properties tbl) throws SerDeException {
final List<String> columnNames = Arrays.asList(tbl.getProperty(Constants.LIST_COLUMNS).split(","));
final List<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(tbl.getProperty(Constants.LIST_COLUMN_TYPES));
numCols = columnNames.size();
@ -60,7 +66,21 @@ public final class CSVSerde implements SerDe {
for (int i=0; i< numCols; i++) {
row.add(null);
}
}
separatorChar = getProperty(tbl, "separatorChar", CSVWriter.DEFAULT_SEPARATOR);
quoteChar = getProperty(tbl, "quoteChar", CSVWriter.DEFAULT_QUOTE_CHARACTER);
escapeChar = getProperty(tbl, "escapeChar", CSVWriter.DEFAULT_ESCAPE_CHARACTER);
}
/**
 * Reads a single-character setting from the table properties.
 *
 * @param tbl table properties to look the setting up in
 * @param property name of the property to read
 * @param def value to return when the property is not set or is set to the empty string
 * @return the first character of the property's value, or {@code def} when there is no
 *         usable value
 */
private final char getProperty(final Properties tbl, final String property, final char def) {
  final String val = tbl.getProperty(property);
  // An empty value has no first character: charAt(0) would throw
  // StringIndexOutOfBoundsException, so treat it the same as "not set".
  if (val == null || val.length() == 0) {
    return def;
  }
  return val.charAt(0);
}
@Override
public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
@ -86,7 +106,7 @@ public final class CSVSerde implements SerDe {
}
final StringWriter writer = new StringWriter();
final CSVWriter csv = new CSVWriter(writer, CSVWriter.DEFAULT_SEPARATOR, CSVWriter.DEFAULT_QUOTE_CHARACTER, "");
final CSVWriter csv = newWriter(writer, separatorChar, quoteChar, escapeChar);
try {
csv.writeNext(outputFields);
@ -104,7 +124,7 @@ public final class CSVSerde implements SerDe {
CSVReader csv = null;
try {
csv = new CSVReader(new CharArrayReader(rowText.toString().toCharArray()));
csv = newReader(new CharArrayReader(rowText.toString().toCharArray()), separatorChar, quoteChar, escapeChar);
final String[] read = csv.readNext();
for (int i=0; i< numCols; i++) {
@ -128,6 +148,24 @@ public final class CSVSerde implements SerDe {
}
}
}
/**
 * Creates a CSVReader over {@code reader} using the given separator and quote characters.
 * The escape character is only handed to the reader when it differs from
 * {@code CSVWriter.DEFAULT_ESCAPE_CHARACTER}; otherwise the three-argument constructor is used.
 *
 * @param reader source of the CSV text
 * @param separator field separator character
 * @param quote quote character
 * @param escape escape character
 * @return a reader configured with the requested characters
 */
private CSVReader newReader(final Reader reader, char separator, char quote, char escape) {
  // CSVReader will throw an exception if any of separator, quote, or escape are the same, but
  // the CSV format specifies that the escape character and quote character are the same.
  // To avoid that clash, the writer's default escape is never passed to the reader explicitly.
  final boolean usesDefaultEscape = (escape == CSVWriter.DEFAULT_ESCAPE_CHARACTER);
  return usesDefaultEscape
      ? new CSVReader(reader, separator, quote)
      : new CSVReader(reader, separator, quote, escape);
}
/**
 * Creates a CSVWriter over {@code writer} using the given separator and quote characters.
 * The escape character is only passed explicitly when it differs from
 * {@code CSVWriter.DEFAULT_ESCAPE_CHARACTER}. The trailing empty-string argument is
 * presumably the line terminator (so no newline is appended) -- confirm against opencsv.
 *
 * @param writer destination for the CSV text
 * @param separator field separator character
 * @param quote quote character
 * @param escape escape character
 * @return a writer configured with the requested characters
 */
private CSVWriter newWriter(final Writer writer, char separator, char quote, char escape) {
  final boolean usesDefaultEscape = (escape == CSVWriter.DEFAULT_ESCAPE_CHARACTER);
  return usesDefaultEscape
      ? new CSVWriter(writer, separator, quote, "")
      : new CSVWriter(writer, separator, quote, escape, "");
}
@Override
public ObjectInspector getObjectInspector() throws SerDeException {

View File

@ -12,24 +12,54 @@ import static org.junit.Assert.assertEquals;
public final class CSVSerdeTest {
private final CSVSerde csv = new CSVSerde();
final Properties props = new Properties();
/**
 * Runs before each test: declares three string columns named a, b and c in the
 * properties and initializes the serde with them.
 */
// NOTE(review): this span declares a local `props` although a field of the same name
// exists, and also calls initialize() here; in this diff rendering these look like
// removed lines shown next to the added ones -- confirm against the actual file.
@Before
public void setup() throws Exception {
final Properties props = new Properties();
props.put(Constants.LIST_COLUMNS, "a,b,c");
props.put(Constants.LIST_COLUMN_TYPES, "string,string,string");
csv.initialize(null, props);
}
/**
 * Deserializes a comma-separated row without setting any custom separator, quote or
 * escape property, and checks that a double-quoted field keeps its embedded comma.
 */
@Test
public void testDeserialize() throws Exception {
  csv.initialize(null, props);

  final Text line = new Text("hello,\"yes, okay\",1");
  final List<String> fields = (List<String>) csv.deserialize(line);

  assertEquals("hello", fields.get(0));
  assertEquals("yes, okay", fields.get(1));
  assertEquals("1", fields.get(2));
}
/**
 * Configures a tab separator and a single-quote quote character, then checks that a
 * quoted field keeps its embedded tab.
 */
@Test
public void testDeserializeCustomSeparators() throws Exception {
  props.put("separatorChar", "\t");
  props.put("quoteChar", "'");
  csv.initialize(null, props);

  final Text line = new Text("hello\t'yes\tokay'\t1");
  final List<String> fields = (List<String>) csv.deserialize(line);

  assertEquals("hello", fields.get(0));
  assertEquals("yes\tokay", fields.get(1));
  assertEquals("1", fields.get(2));
}
/**
 * Configures a single-quote quote character and a backslash escape character, then
 * checks that an escaped quote inside a quoted field is returned as a literal quote.
 */
@Test
public void testDeserializeCustomEscape() throws Exception {
  props.put("quoteChar", "'");
  props.put("escapeChar", "\\");
  csv.initialize(null, props);

  final Text line = new Text("hello,'yes\\'okay',1");
  final List<String> fields = (List<String>) csv.deserialize(line);

  assertEquals("hello", fields.get(0));
  assertEquals("yes'okay", fields.get(1));
  assertEquals("1", fields.get(2));
}
}