allow custom separator, escape, quote chars
parent
31c847466e
commit
9ac30d9020
|
@ -5,7 +5,7 @@
|
|||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
|
||||
<classpathentry kind="lib" path="lib/default/hadoop-core.jar"/>
|
||||
<classpathentry kind="lib" path="lib/default/hive-exec.jar"/>
|
||||
<classpathentry kind="lib" path="lib/default/opencsv.jar"/>
|
||||
<classpathentry kind="lib" path="lib/default/opencsv.jar" sourcepath="lib/sources/opencsv-src.jar"/>
|
||||
<classpathentry kind="lib" path="lib/test/junit.jar"/>
|
||||
<classpathentry kind="lib" path="lib/buildtime/log4j.jar"/>
|
||||
<classpathentry kind="lib" path="lib/test/commons-logging.jar"/>
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
lib
|
||||
target
|
2
ivy.xml
2
ivy.xml
|
@ -19,7 +19,7 @@
|
|||
|
||||
<dependencies defaultconf="default->default;hadoop->default;sources->sources()">
|
||||
<!-- default (compile, test, runtime) dependencies -->
|
||||
<dependency org="opencsv" name="opencsv" rev="2.2" />
|
||||
<dependency org="opencsv" name="opencsv" rev="2.3" />
|
||||
|
||||
<dependency org="apache" name="hive" rev="0.5.0" conf="default->exec" />
|
||||
|
||||
|
|
|
@ -4,4 +4,4 @@
|
|||
# The location of the common build system
|
||||
common.build.dir=${basedir}/../common-build
|
||||
|
||||
project.version=1.1.1
|
||||
project.version=1.1.2
|
||||
|
|
|
@ -2,7 +2,9 @@ package com.bizo.hive.serde.csv;
|
|||
|
||||
import java.io.CharArrayReader;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringWriter;
|
||||
import java.io.Writer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
@ -39,10 +41,14 @@ public final class CSVSerde implements SerDe {
|
|||
private int numCols;
|
||||
private List<String> row;
|
||||
|
||||
private char separatorChar;
|
||||
private char quoteChar;
|
||||
private char escapeChar;
|
||||
|
||||
|
||||
@Override
|
||||
public void initialize(final Configuration conf, final Properties tbl) throws SerDeException {
|
||||
final List<String> columnNames = Arrays.asList(tbl.getProperty(Constants.LIST_COLUMNS).split(","));
|
||||
|
||||
final List<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(tbl.getProperty(Constants.LIST_COLUMN_TYPES));
|
||||
|
||||
numCols = columnNames.size();
|
||||
|
@ -60,7 +66,21 @@ public final class CSVSerde implements SerDe {
|
|||
for (int i=0; i< numCols; i++) {
|
||||
row.add(null);
|
||||
}
|
||||
}
|
||||
|
||||
separatorChar = getProperty(tbl, "separatorChar", CSVWriter.DEFAULT_SEPARATOR);
|
||||
quoteChar = getProperty(tbl, "quoteChar", CSVWriter.DEFAULT_QUOTE_CHARACTER);
|
||||
escapeChar = getProperty(tbl, "escapeChar", CSVWriter.DEFAULT_ESCAPE_CHARACTER);
|
||||
}
|
||||
|
||||
private final char getProperty(final Properties tbl, final String property, final char def) {
|
||||
final String val = tbl.getProperty(property);
|
||||
|
||||
if (val != null) {
|
||||
return val.charAt(0);
|
||||
}
|
||||
|
||||
return def;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
|
||||
|
@ -86,7 +106,7 @@ public final class CSVSerde implements SerDe {
|
|||
}
|
||||
|
||||
final StringWriter writer = new StringWriter();
|
||||
final CSVWriter csv = new CSVWriter(writer, CSVWriter.DEFAULT_SEPARATOR, CSVWriter.DEFAULT_QUOTE_CHARACTER, "");
|
||||
final CSVWriter csv = newWriter(writer, separatorChar, quoteChar, escapeChar);
|
||||
|
||||
try {
|
||||
csv.writeNext(outputFields);
|
||||
|
@ -104,7 +124,7 @@ public final class CSVSerde implements SerDe {
|
|||
|
||||
CSVReader csv = null;
|
||||
try {
|
||||
csv = new CSVReader(new CharArrayReader(rowText.toString().toCharArray()));
|
||||
csv = newReader(new CharArrayReader(rowText.toString().toCharArray()), separatorChar, quoteChar, escapeChar);
|
||||
final String[] read = csv.readNext();
|
||||
|
||||
for (int i=0; i< numCols; i++) {
|
||||
|
@ -128,6 +148,24 @@ public final class CSVSerde implements SerDe {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
private CSVReader newReader(final Reader reader, char separator, char quote, char escape) {
|
||||
// CSVReader will throw an exception if any of separator, quote, or escape is the same, but
|
||||
// the CSV format specifies that the escape character and quote char are the same... very weird
|
||||
if (CSVWriter.DEFAULT_ESCAPE_CHARACTER == escape) {
|
||||
return new CSVReader(reader, separator, quote);
|
||||
} else {
|
||||
return new CSVReader(reader, separator, quote, escape);
|
||||
}
|
||||
}
|
||||
|
||||
private CSVWriter newWriter(final Writer writer, char separator, char quote, char escape) {
|
||||
if (CSVWriter.DEFAULT_ESCAPE_CHARACTER == escape) {
|
||||
return new CSVWriter(writer, separator, quote, "");
|
||||
} else {
|
||||
return new CSVWriter(writer, separator, quote, escape, "");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public ObjectInspector getObjectInspector() throws SerDeException {
|
||||
|
|
|
@ -12,24 +12,54 @@ import static org.junit.Assert.assertEquals;
|
|||
|
||||
public final class CSVSerdeTest {
|
||||
private final CSVSerde csv = new CSVSerde();
|
||||
final Properties props = new Properties();
|
||||
|
||||
@Before
|
||||
public void setup() throws Exception {
|
||||
final Properties props = new Properties();
|
||||
props.put(Constants.LIST_COLUMNS, "a,b,c");
|
||||
props.put(Constants.LIST_COLUMN_TYPES, "string,string,string");
|
||||
|
||||
csv.initialize(null, props);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDeserialize() throws Exception {
|
||||
csv.initialize(null, props);
|
||||
final Text in = new Text("hello,\"yes, okay\",1");
|
||||
|
||||
final List<String> row = (List<String>) csv.deserialize(in);
|
||||
|
||||
|
||||
assertEquals("hello", row.get(0));
|
||||
assertEquals("yes, okay", row.get(1));
|
||||
assertEquals("1", row.get(2));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDeserializeCustomSeparators() throws Exception {
|
||||
props.put("separatorChar", "\t");
|
||||
props.put("quoteChar", "'");
|
||||
|
||||
csv.initialize(null, props);
|
||||
|
||||
final Text in = new Text("hello\t'yes\tokay'\t1");
|
||||
final List<String> row = (List<String>) csv.deserialize(in);
|
||||
|
||||
assertEquals("hello", row.get(0));
|
||||
assertEquals("yes\tokay", row.get(1));
|
||||
assertEquals("1", row.get(2));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDeserializeCustomEscape() throws Exception {
|
||||
props.put("quoteChar", "'");
|
||||
props.put("escapeChar", "\\");
|
||||
|
||||
csv.initialize(null, props);
|
||||
|
||||
final Text in = new Text("hello,'yes\\'okay',1");
|
||||
final List<String> row = (List<String>) csv.deserialize(in);
|
||||
|
||||
assertEquals("hello", row.get(0));
|
||||
assertEquals("yes'okay", row.get(1));
|
||||
assertEquals("1", row.get(2));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue