MapReduce secondary sorting

Posted by steply on Thu, 08 Aug 2019 12:00:03 +0200

1 Secondary sorting
1.1 Approach

Secondary sorting means that records sharing the same value in a first field are further sorted by a second field.
For example, an e-commerce platform records the amount of every order placed by each user. The requirement is to output all orders belonging to the same user sorted by amount, with the user names themselves also appearing in sorted order.

Account           Order amount
hadoop@apache 200
hive@apache 550
yarn@apache 580
hive@apache 159
hadoop@apache 300
hive@apache 258
hadoop@apache 300
yarn@apache 100
hadoop@apache 150
yarn@apache 560
yarn@apache 260
Result after secondary sorting

Account           Order amount
hadoop@apache 150
hadoop@apache 200
hadoop@apache 300
hadoop@apache 300
hive@apache 159
hive@apache 258
hive@apache 550
yarn@apache 100
yarn@apache 260
yarn@apache 560
yarn@apache 580
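
Outside of MapReduce, this is just a two-level ordering: sort by account first, then by amount within the same account. As a point of reference, here is a minimal plain-Java sketch of the same ordering (the Order class and its fields are illustrative and not part of the job below):

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class InMemorySortDemo {

    // Illustrative record type: one line per (account, amount) pair
    static class Order {
        final String account;
        final double amount;

        Order(String account, double amount) {
            this.account = account;
            this.amount = amount;
        }

        @Override
        public String toString() {
            return account + "\t" + amount;
        }
    }

    public static void main(String[] args) {
        List<Order> orders = new ArrayList<>();
        orders.add(new Order("hadoop@apache", 200));
        orders.add(new Order("hive@apache", 550));
        orders.add(new Order("yarn@apache", 580));
        orders.add(new Order("hadoop@apache", 150));

        // Sort by account first, then by amount within the same account
        orders.sort(Comparator.comparing((Order o) -> o.account)
                              .thenComparingDouble(o -> o.amount));

        orders.forEach(System.out::println);
    }
}
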
The implementation uses a custom key that sorts on both fields (user name first, then order amount), a custom partitioner and grouping comparator that partition and group by user name only, and a custom sort comparator used for merge sorting on both the map side and the reduce side.

java.io.DataOutputStream.writeUTF() serializes strings with a modified UTF-8 encoding, which makes the resulting byte stream inconvenient to parse inside a RawComparator.
The implementation therefore takes a simpler approach: it writes the length of the Account field's UTF-8 byte array as an int, followed by the raw bytes themselves, so the byte stream the RawComparator receives is exactly what was written. When deserializing, the Account field is read back according to this length.
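
To see the difference between the two layouts, the following standalone sketch (not part of the job) serializes the same account string with writeUTF() and with the writeInt()-plus-write() scheme used by CostBean.write() in the implementation below, and prints the size of each buffer:

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.nio.charset.Charset;

public class SerializationLayoutDemo {
    public static void main(String[] args) throws Exception {
        String account = "hadoop@apache";

        // writeUTF(): 2-byte length prefix followed by modified UTF-8 bytes
        ByteArrayOutputStream utfBuffer = new ByteArrayOutputStream();
        new DataOutputStream(utfBuffer).writeUTF(account);

        // writeInt() + write(): 4-byte length prefix followed by the raw UTF-8 bytes,
        // the same layout CostBean.write() produces and the RawComparator parses
        byte[] raw = account.getBytes(Charset.forName("UTF-8"));
        ByteArrayOutputStream rawBuffer = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(rawBuffer);
        out.writeInt(raw.length);
        out.write(raw);

        System.out.println("writeUTF()          : " + utfBuffer.size() + " bytes");
        System.out.println("writeInt() + write(): " + rawBuffer.size() + " bytes");
    }
}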

1.2 Implementation

Program Code

package com.hadoop;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.charset.Charset;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class SecondarySortMapReduce extends Configured implements Tool {

/**
 * Composite key carrying the account and the order amount
 * @author Ivan
 *
 */
public static class CostBean implements WritableComparable<CostBean> {
    private String account;
    private double cost;
    
    public void set(String account, double cost) {
        this.account = account;
        this.cost = cost;
    }
    
    public String getAccount() {
        return account;
    }
    
    public double getCost() {
        return cost;
    }
    
    @Override
    public void write(DataOutput out) throws IOException {
        byte[] buffer = account.getBytes(Charset.forName("UTF-8"));
        
        out.writeInt(buffer.length);                // The byte stream length of the account. out.writeUTF() uses a complex encoding method, requiring DataInput.readUTF() to decode, which is not used here.
        out.write(buffer);
        out.writeDouble(cost);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        int accountLength = in.readInt();
        byte[] bytes = new byte[accountLength];
        in.readFully(bytes);
        
        account = new String(bytes, Charset.forName("UTF-8"));   // decode with the same charset used in write()
        cost = in.readDouble();
    }

    @Override
    public int compareTo(CostBean o) {
        if (account.equals(o.account)) {        //Accounts are equal. Next, compare the amount of consumption. 
            return cost == o.cost ? 0 : (cost > o.cost ? 1 : -1);
        }
        
        return account.compareTo(o.account);
    }
    
    @Override
    public String toString() {
        return account + "\t" + cost;
    }
}

/**
 * Comparator for map-side and reduce-side sorting: compare amounts if accounts are the same
 * @author Ivan
 *
 */
public static class CostBeanComparator extends WritableComparator {
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        int accountLength1 = readInt(b1, s1);  
        int accountLength2 = readInt(b2, s2);
        
        int result = compareBytes(b1, s1 + 4, accountLength1, b2, s2 + 4, accountLength2);
        if (result == 0) {  // If the account is the same, the amount will be compared. 
            double thisValue = readDouble(b1, s1 + 4 + accountLength1);
            double thatValue = readDouble(b2, s2 + 4 + accountLength2);
            return (thisValue < thatValue ? -1 : (thisValue == thatValue ? 0 : 1));
        } else {            
            return result;
        }
    }
}

/**
 * Partitioner used when map output is written to disk: partitions records by account
 * @author Ivan
 *
 */
public static class CostBeanPartitioner extends Partitioner<CostBean, DoubleWritable> {
    
    /**
     * Partition by account
     */
    @Override
    public int getPartition(CostBean key, DoubleWritable value, int numPartitions) {
        // Mask the sign bit so the partition index is never negative
        return (key.account.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

/**
 * Grouping comparator for the reduce side: groups records by the account field only, so all records with the same account form one group
 * @author Ivan
 *
 */
public static class GroupComparator extends WritableComparator {
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        int accountLength1 = readInt(b1, s1);  
        int accountLength2 = readInt(b2, s2);
        
        byte[] tmpb1 = new byte[accountLength1];
        byte[] tmpb2 = new byte[accountLength2];
        System.arraycopy(b1, s1 + 4, tmpb1, 0, accountLength1);
        System.arraycopy(b2, s2 + 4, tmpb2, 0, accountLength2);
        
        String account1 = new String(tmpb1, Charset.forName("UTF-8"));
        String account2 = new String(tmpb2, Charset.forName("UTF-8"));
        
        System.out.println("grouping: account1=" + account1 + ", account2=" + account2);
        
        return compareBytes(b1, s1 + 4, accountLength1, b2, s2 + 4, accountLength2);
    }
}

/**
 * Mapper class
 * @author Ivan
 *
 */
public static class SecondarySortMapper extends Mapper<LongWritable, Text, CostBean, DoubleWritable> {
    private final CostBean outputKey = new CostBean();
    private final DoubleWritable outputValue = new DoubleWritable();
    
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] data = value.toString().split("\t");
        
        double cost = Double.parseDouble(data[1]);
        outputKey.set(data[0].trim(), cost);
        outputValue.set(cost);          

        context.write(outputKey, outputValue);
    }
}

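/**
 * Reducer class: within each account group the values arrive already sorted by amount,
 * so they can be written out in order directly
 */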
public static class SecondarySortReducer extends Reducer<CostBean, DoubleWritable, Text, DoubleWritable> {
    private final Text outputKey = new Text();
    private final DoubleWritable outputValue = new DoubleWritable();
    @Override
    protected void reduce(CostBean key, Iterable<DoubleWritable> values,Context context)
            throws IOException, InterruptedException {
        outputKey.set(key.getAccount());
        
        for (DoubleWritable v : values) {
            outputValue.set(v.get());
            context.write(outputKey, outputValue);
        }
    }
}

public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf, SecondarySortMapReduce.class.getSimpleName());
    job.setJarByClass(SecondarySortMapReduce.class);
    
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
    // map settings
    job.setMapperClass(SecondarySortMapper.class);
    job.setMapOutputKeyClass(CostBean.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    
    // partition settings
    job.setPartitionerClass(CostBeanPartitioner.class);
    
    // sorting      
    job.setSortComparatorClass(CostBeanComparator.class);
    
    // grouping
    job.setGroupingComparatorClass(GroupComparator.class);
    
    // reduce settings
    job.setReducerClass(SecondarySortReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    
    boolean res = job.waitForCompletion(true);
    
    return res ? 0 : 1;
}

/**
 * @param args
 * @throws Exception 
 */
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        throw new IllegalArgumentException("Usage: <inpath> <outpath>");
    }
    
    int exitCode = ToolRunner.run(new Configuration(), new SecondarySortMapReduce(), args);
    System.exit(exitCode);
}

}
1.3 Test

Operating environment

Operating system: CentOS 6.4
Hadoop: Apache Hadoop 2.5.0
The data from the example above is used as test input.

Account           Amount
hadoop@apache 200
hive@apache 550
yarn@apache 580
hive@apache 159
hadoop@apache 300
hive@apache 258
hadoop@apache 300
yarn@apache 100
hadoop@apache 150
yarn@apache 560
yarn@apache 260
