Browse Source

Added line end delimiter and null string support

Abhinav Sarkar 6 years ago
parent
commit
8daf918fc8
1 changed file with 50 additions and 49 deletions
  1. 50
    49
      src/main/java/com/bizo/hive/serde/csv/CSVSerde.java

+ 50
- 49
src/main/java/com/bizo/hive/serde/csv/CSVSerde.java View File

@@ -1,15 +1,7 @@
1 1
 package com.bizo.hive.serde.csv;
2 2
 
3
-import java.io.CharArrayReader;
4
-import java.io.IOException;
5
-import java.io.Reader;
6
-import java.io.StringWriter;
7
-import java.io.Writer;
8
-import java.util.ArrayList;
9
-import java.util.Arrays;
10
-import java.util.List;
11
-import java.util.Properties;
12
-
3
+import au.com.bytecode.opencsv.CSVReader;
4
+import au.com.bytecode.opencsv.CSVWriter;
13 5
 import org.apache.hadoop.conf.Configuration;
14 6
 import org.apache.hadoop.hive.serde.Constants;
15 7
 import org.apache.hadoop.hive.serde2.SerDe;
@@ -26,68 +18,74 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
26 18
 import org.apache.hadoop.io.Text;
27 19
 import org.apache.hadoop.io.Writable;
28 20
 
29
-import au.com.bytecode.opencsv.CSVReader;
30
-import au.com.bytecode.opencsv.CSVWriter;
21
+import java.io.*;
22
+import java.util.ArrayList;
23
+import java.util.Arrays;
24
+import java.util.List;
25
+import java.util.Properties;
31 26
 
32 27
 
33 28
 /**
34 29
  * CSVSerde uses opencsv (http://opencsv.sourceforge.net/) to serialize/deserialize columns as CSV.
35
- * 
30
+ *
36 31
  * @author Larry Ogrodnek <ogrodnek@gmail.com>
37 32
  */
38 33
 public final class CSVSerde implements SerDe {
39
-  
34
+
40 35
   private ObjectInspector inspector;
41 36
   private String[] outputFields;
42 37
   private int numCols;
43 38
   private List<String> row;
44
-  
39
+
45 40
   private char separatorChar;
46 41
   private char quoteChar;
47 42
   private char escapeChar;
48
-  
49
-    
43
+  private String lineEnd;
44
+  private String nullDefinedAs;
45
+
50 46
   @Override
51 47
   public void initialize(final Configuration conf, final Properties tbl) throws SerDeException {
52 48
     final List<String> columnNames = Arrays.asList(tbl.getProperty(Constants.LIST_COLUMNS).split(","));
53 49
     final List<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(tbl.getProperty(Constants.LIST_COLUMN_TYPES));
54
-    
50
+
55 51
     numCols = columnNames.size();
56
-    
52
+
57 53
     final List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(numCols);
58
-    
54
+
59 55
     for (int i=0; i< numCols; i++) {
60 56
       columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
61 57
     }
62
-    
58
+
63 59
     this.inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnOIs);
64 60
     this.outputFields = new String[numCols];
65 61
     row = new ArrayList<String>(numCols);
66
-    
62
+
67 63
     for (int i=0; i< numCols; i++) {
68 64
       row.add(null);
69 65
     }
70
-    
66
+
71 67
     separatorChar = getProperty(tbl, "separatorChar", CSVWriter.DEFAULT_SEPARATOR);
72 68
     quoteChar = getProperty(tbl, "quoteChar", CSVWriter.DEFAULT_QUOTE_CHARACTER);
73 69
     escapeChar = getProperty(tbl, "escapeChar", CSVWriter.DEFAULT_ESCAPE_CHARACTER);
70
+    lineEnd = tbl.getProperty("lineEnd", CSVWriter.DEFAULT_LINE_END);
71
+    nullDefinedAs = tbl.getProperty("nullDefinedAs", "\\N");
74 72
   }
75
-  
73
+
76 74
   private final char getProperty(final Properties tbl, final String property, final char def) {
77 75
     final String val = tbl.getProperty(property);
78
-    
76
+
79 77
     if (val != null) {
80 78
       return val.charAt(0);
81 79
     }
82
-    
80
+
83 81
     return def;
84 82
   }
85
-  
83
+
86 84
   @Override
87 85
   public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
88 86
     final StructObjectInspector outputRowOI = (StructObjectInspector) objInspector;
89 87
     final List<? extends StructField> outputFieldRefs = outputRowOI.getAllStructFieldRefs();
90
-    
88
+
91 89
     if (outputFieldRefs.size() != numCols) {
92 90
       throw new SerDeException("Cannot serialize the object because there are "
93 91
           + outputFieldRefs.size() + " fields but the table has " + numCols + " columns.");
@@ -97,45 +95,48 @@ public final class CSVSerde implements SerDe {
97 95
     for (int c = 0; c < numCols; c++) {
98 96
       final Object field = outputRowOI.getStructFieldData(obj, outputFieldRefs.get(c));
99 97
       final ObjectInspector fieldOI = outputFieldRefs.get(c).getFieldObjectInspector();
100
-      
98
+
101 99
       // The data must be of type String
102 100
       final StringObjectInspector fieldStringOI = (StringObjectInspector) fieldOI;
103
-      
101
+
104 102
       // Convert the field to Java class String, because objects of String type
105 103
       // can be stored in String, Text, or some other classes.
106 104
       outputFields[c] = fieldStringOI.getPrimitiveJavaObject(field);
105
+      if (outputFields[c] == null) {
106
+          outputFields[c] = nullDefinedAs;
107
+      }
107 108
     }
108
-    
109
+
109 110
     final StringWriter writer = new StringWriter();
110
-    final CSVWriter csv = newWriter(writer, separatorChar, quoteChar, escapeChar);
111
-    
111
+    final CSVWriter csv = newWriter(writer, separatorChar, quoteChar, escapeChar, lineEnd);
112
+
112 113
     try {
113 114
       csv.writeNext(outputFields);
114 115
       csv.close();
115
-      
116
+
116 117
       return new Text(writer.toString());
117 118
     } catch (final IOException ioe) {
118 119
       throw new SerDeException(ioe);
119 120
     }
120
-  }  
121
+  }
121 122
 
122 123
   @Override
123 124
   public Object deserialize(final Writable blob) throws SerDeException {
124 125
     Text rowText = (Text) blob;
125
-    
126
+
126 127
     CSVReader csv = null;
127 128
     try {
128
-      csv = newReader(new CharArrayReader(rowText.toString().toCharArray()), separatorChar, quoteChar, escapeChar);      
129
+      csv = newReader(new CharArrayReader(rowText.toString().toCharArray()), separatorChar, quoteChar, escapeChar);
129 130
       final String[] read = csv.readNext();
130
-      
131
+
131 132
       for (int i=0; i< numCols; i++) {
132 133
         if (read != null && i < read.length) {
133
-          row.set(i, read[i]);
134
+          row.set(i, nullDefinedAs.equals(read[i]) ? null : read[i]); // equals(), not ==: parsed fields are distinct String instances
134 135
         } else {
135 136
           row.set(i, null);
136 137
         }
137 138
       }
138
-      
139
+
139 140
       return row;
140 141
     } catch (final Exception e) {
141 142
       throw new SerDeException(e);
@@ -149,22 +150,22 @@ public final class CSVSerde implements SerDe {
149 150
       }
150 151
     }
151 152
   }
152
-  
153
-  private CSVReader newReader(final Reader reader, char separator, char quote, char escape) {
154
-    // CSVReader will throw an exception if any of separator, quote, or escape is the same, but 
153
+
154
+  private static CSVReader newReader(final Reader reader, char separator, char quote, char escape) {
155
+    // CSVReader will throw an exception if any of separator, quote, or escape is the same, but
155 156
     // the CSV format specifies that the escape character and quote char are the same... very weird
156 157
     if (CSVWriter.DEFAULT_ESCAPE_CHARACTER == escape) {
157 158
       return new CSVReader(reader, separator, quote);
158 159
     } else {
159
-      return new CSVReader(reader, separator, quote, escape);      
160
+      return new CSVReader(reader, separator, quote, escape);
160 161
     }
161 162
   }
162
-  
163
-  private CSVWriter newWriter(final Writer writer, char separator, char quote, char escape) {
163
+
164
+  private static CSVWriter newWriter(final Writer writer, char separator, char quote, char escape, String lineEnd) {
164 165
     if (CSVWriter.DEFAULT_ESCAPE_CHARACTER == escape) {
165
-      return new CSVWriter(writer, separator, quote, "");
166
+      return new CSVWriter(writer, separator, quote, lineEnd);
166 167
     } else {
167
-      return new CSVWriter(writer, separator, quote, escape, "");      
168
+      return new CSVWriter(writer, separator, quote, escape, lineEnd);
168 169
     }
169 170
   }
170 171
 
@@ -177,7 +178,7 @@ public final class CSVSerde implements SerDe {
177 178
   public Class<? extends Writable> getSerializedClass() {
178 179
     return Text.class;
179 180
   }
180
-  
181
+
181 182
   public SerDeStats getSerDeStats() {
182 183
     return null;
183 184
   }

Loading…
Cancel
Save