
How to deal with multi-byte delimiters in Hive


Suppose we have the following batch of data:

01||zs||18
02||ls||19
03||jj||10

Each row uses || as its field delimiter, which is a multi-byte delimiter. By default, Hive only supports single-byte field delimiters, so rows delimited by || like the ones above cannot be parsed directly.
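For context, here is what goes wrong with the naive approach: Hive's default LazySimpleSerDe keeps only the first byte of the delimiter string, so FIELDS TERMINATED BY '||' silently degrades to '|'. A minimal sketch (the table name t_raw is hypothetical):

-- naive attempt: only the first byte of '||' takes effect
create table t_raw (id string, name string, age string)
row format delimited fields terminated by '||';

load data local inpath '/Hadoop/data/1.txt' into table t_raw;

-- splitting 01||zs||18 on every single '|' yields id=01, name='' (empty), age=zs
select * from t_raw;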

Solution:

Method 1: use RegexSerDe to extract the fields with a regular expression

# create table statement
create table t_bi_reg (id string, name string, age string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties (
  'input.regex'='(.*)\\|\\|(.*)\\|\\|(.*)',
  'output.format.string'='%1$s %2$s %3$s'
);
# input.regex: the regular expression used to split each row into fields
# output.format.string: how the captured fields are emitted after splitting

# load data
load data local inpath '/Hadoop/data/1.txt' into table t_bi_reg;

# query
select * from t_bi_reg;
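With the sample file above, the query should return the three rows split cleanly into columns (expected result):

01    zs    18
02    ls    19
03    jj    10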

Method 2: customize the InputFormat (modify the source code)

The principle is to replace the multi-byte delimiter in the data with Hive's default delimiter (Ctrl+A, i.e. \001) or with another single-byte delimiter while the InputFormat reads each row, so that Hive can then split the fields on a single-byte delimiter as usual during the SerDe phase. In the code below, || is rewritten to the single byte |.

Code

com.zy.hive.delimit2.BiDelimiterInputFormat:

package com.zy.hive.delimit2;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class BiDelimiterInputFormat extends TextInputFormat {

    @Override
    public RecordReader<LongWritable, Text> getRecordReader(InputSplit genericSplit,
            JobConf job, Reporter reporter) throws IOException {
        reporter.setStatus(genericSplit.toString());
        // hand each split to the custom reader, which rewrites the delimiter per line
        BiRecordReader reader = new BiRecordReader(job, (FileSplit) genericSplit);
        // MyRecordReader reader = new MyRecordReader(job, (FileSplit) genericSplit);
        return reader;
    }
}

com.zy.hive.delimit2.BiRecordReader:

package com.zy.hive.delimit2;

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.LineRecordReader;
import org.apache.hadoop.mapred.RecordReader;

public class BiRecordReader implements RecordReader<LongWritable, Text> {
    private static final Log LOG = LogFactory.getLog(LineRecordReader.class.getName());

    private CompressionCodecFactory compressionCodecs = null;
    private long start;
    private long pos;
    private long end;
    private LineReader in;
    int maxLineLength;
    private Seekable filePosition;
    private CompressionCodec codec;
    private Decompressor decompressor;

    /**
     * A class that provides a line reader from an input stream.
     * @deprecated Use {@link org.apache.hadoop.util.LineReader} instead.
     */
    @Deprecated
    public static class LineReader extends org.apache.hadoop.util.LineReader {
        LineReader(InputStream in) {
            super(in);
        }
        LineReader(InputStream in, int bufferSize) {
            super(in, bufferSize);
        }
        public LineReader(InputStream in, Configuration conf) throws IOException {
            super(in, conf);
        }
    }

    public BiRecordReader(Configuration job, FileSplit split) throws IOException {
        this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
                Integer.MAX_VALUE);
        start = split.getStart();
        end = start + split.getLength();
        final Path file = split.getPath();
        compressionCodecs = new CompressionCodecFactory(job);
        codec = compressionCodecs.getCodec(file);

        // open the file and seek to the start of the split
        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(split.getPath());
        if (isCompressedInput()) {
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec)
                        .createInputStream(fileIn, decompressor, start, end,
                                SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = new LineReader(cIn, job);
                start = cIn.getAdjustedStart();
                end = cIn.getAdjustedEnd();
                filePosition = cIn; // take pos from compressed stream
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
                filePosition = fileIn;
            }
        } else {
            fileIn.seek(start);
            in = new LineReader(fileIn, job);
            filePosition = fileIn;
        }
        // If this is not the first split, we always throw away first record
        // because we always (except the last split) read one extra line in
        // next() method.
        if (start != 0) {
            start += in.readLine(new Text(), 0, maxBytesToConsume(start));
        }
        this.pos = start;
    }

    private boolean isCompressedInput() {
        return (codec != null);
    }

    private int maxBytesToConsume(long pos) {
        return isCompressedInput() ? Integer.MAX_VALUE
                : (int) Math.min(Integer.MAX_VALUE, end - pos);
    }

    private long getFilePosition() throws IOException {
        long retVal;
        if (isCompressedInput() && null != filePosition) {
            retVal = filePosition.getPos();
        } else {
            retVal = pos;
        }
        return retVal;
    }

    public BiRecordReader(InputStream in, long offset, long endOffset, int maxLineLength) {
        this.maxLineLength = maxLineLength;
        this.in = new LineReader(in);
        this.start = offset;
        this.pos = offset;
        this.end = endOffset;
        this.filePosition = null;
    }

    public BiRecordReader(InputStream in, long offset, long endOffset, Configuration job)
            throws IOException {
        this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
                Integer.MAX_VALUE);
        this.in = new LineReader(in, job);
        this.start = offset;
        this.pos = offset;
        this.end = endOffset;
        this.filePosition = null;
    }

    public LongWritable createKey() {
        return new LongWritable();
    }

    public Text createValue() {
        return new Text();
    }

    /** Read a line. */
    public synchronized boolean next(LongWritable key, Text value) throws IOException {
        // We always read one extra line, which lies outside the upper
        // split limit, i.e. (end - 1)
        while (getFilePosition() <= end) {
            key.set(pos);
            int newSize = in.readLine(value, maxLineLength,
                    Math.max(maxBytesToConsume(pos), maxLineLength));
            // the source listing is truncated from this point; the remainder is
            // reconstructed per the principle above: rewrite the two-byte
            // delimiter "||" to the single byte "|" so Hive can split the fields
            String str = value.toString().replaceAll("\\|\\|", "\\|");
            value.set(str);
            pos += newSize;
            if (newSize == 0) {
                return false;
            }
            if (newSize < maxLineLength) {
                return true;
            }
            // line too long, try again
            LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
        }
        return false;
    }

    public float getProgress() throws IOException {
        if (start == end) {
            return 0.0f;
        }
        return Math.min(1.0f, (getFilePosition() - start) / (float) (end - start));
    }

    public synchronized long getPos() throws IOException {
        return pos;
    }

    public synchronized void close() throws IOException {
        try {
            if (in != null) {
                in.close();
            }
        } finally {
            if (decompressor != null) {
                CodecPool.returnDecompressor(decompressor);
            }
        }
    }
}
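To actually use this, the compiled classes must be on Hive's classpath. A minimal usage sketch, assuming the two classes are packaged into a jar (the jar path and the table name t_bi are hypothetical) and that BiRecordReader rewrites || to the single byte |:

-- make the custom classes visible to the Hive session
add jar /path/to/bi-inputformat.jar;

-- declare the table with the custom InputFormat; fields are split on the
-- single-byte '|' that BiRecordReader substitutes for '||'
create table t_bi (id string, name string, age string)
row format delimited fields terminated by '|'
stored as inputformat 'com.zy.hive.delimit2.BiDelimiterInputFormat'
outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';

load data local inpath '/Hadoop/data/1.txt' into table t_bi;

select * from t_bi;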
