Chapter 7. Implementing Sorting in MapReduce


  • Secondary sort - composite key → composite key comparator → partitioner → group key comparator (e.g., all records for the year 2000 reach a single reducer call, and within that call the monthly keys (2000, 1), (2000, 2), ... arrive in sorted order)


① Implementing the composite key - DateKey.java
- A composite key is a class that bundles the existing key values into a single key.
- With the composite key applied, the year and the month are each defined as member variables.
package pj01.hadoop.chapter06;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
public class DateKey implements WritableComparable<DateKey> {
    private String year;
    private Integer month;
    public DateKey() {
    }
    public DateKey(String year, Integer date) {
        this.year = year;
        this.month = date;
    }
    public String getYear() {
        return year;
    }
    public void setYear(String year) {
        this.year = year;
    }
    public Integer getMonth() {
        return month;
    }
    public void setMonth(Integer month) {
        this.month = month;
    }
    @Override
    public String toString() {
        return (new StringBuilder()).append(year).append(",").append(month)
                .toString();
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        year = WritableUtils.readString(in);
        month = in.readInt();
    }
    @Override
    public void write(DataOutput out) throws IOException {
        WritableUtils.writeString(out, year);
        out.writeInt(month);
    }
    @Override
    public int compareTo(DateKey key) {
        int result = year.compareTo(key.year);
        if (0 == result) {
            result = month.compareTo(key.month);
        }
        return result;
        // if (this.year.compareTo(key.year) != 0) {
        // return this.year.compareTo(key.year);
        // } else if (this.distance != key.distance) {
        // return this.distance < key.distance ? -1 : 1;
        // } else {
        // return 0;
        // }
    }
}

② Implementing the composite key comparator - DateKeyComparator.java
- A class that defines the sort order of composite keys.
(It compares two composite keys, member variable by member variable, to decide the sort order.)
package pj01.hadoop.chapter06;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class DateKeyComparator extends WritableComparator {
    protected DateKeyComparator() {
        super(DateKey.class, true);
    }
    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable w1, WritableComparable w2) {
        DateKey k1 = (DateKey) w1;
        DateKey k2 = (DateKey) w2;
        int cmp = k1.getYear().compareTo(k2.getYear());
        if (cmp != 0) {
            return cmp;
        }
        return k1.getMonth() == k2.getMonth() ? 0 : (k1.getMonth() < k2
                .getMonth() ? -1 : 1);
    }
}

③ Implementing the group key partitioner - GroupKeyPartitioner.java
- The partitioner decides which reduce task each map output record is sent to, and the partitioned data is then sorted by the key of the map output.
- The group key partitioner implemented here partitions on the year, which is used as the grouping key.
package pj01.hadoop.chapter06;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
public class GroupKeyPartitioner extends Partitioner<DateKey, IntWritable> {
    @Override
    public int getPartition(DateKey key, IntWritable val, int numPartitions) {
        int hash = key.getYear().hashCode();
        int partition = hash % numPartitions;
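        // Note: String.hashCode() can be negative, so hash % numPartitions could go negative;
        // a defensive variant would use (hash & Integer.MAX_VALUE) % numPartitions.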
        return partition;
    }
}

④ Implementing the group key comparator - GroupKeyComparator.java
- The reducer uses the group key comparator to process all data for the same year in a single reducer group.
package pj01.hadoop.chapter06;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class GroupKeyComparator extends WritableComparator {
    protected GroupKeyComparator() {
        super(DateKey.class, true);
    }
    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable w1, WritableComparable w2) {
        DateKey k1 = (DateKey) w1;
        DateKey k2 = (DateKey) w2;
        return k1.getYear().compareTo(k2.getYear());
    }
}

⑤ Implementing the mapper - DelayCountMapperWithDateKey.java
package pj01.hadoop.chapter06;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import wikibooks.hadoop.chapter05.DelayCounters;
public class DelayCountMapperWithDateKey extends
        Mapper<LongWritable, Text, DateKey, IntWritable> {
    private final static IntWritable outputValue = new IntWritable(1);
    private DateKey outputKey = new DateKey();
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        if (key.get() > 0) {
            String[] colums = value.toString().split(",");
            if (colums != null && colums.length > 0) {
                try {
                    if (!colums[15].equals("NA")) {
                        int depDelayTime = Integer.parseInt(colums[15]);
                        if (depDelayTime > 0) {
                            outputKey.setYear("D," + colums[0]);
                            outputKey.setMonth(new Integer(colums[1]));
                            context.write(outputKey, outputValue);
                        } else if (depDelayTime == 0) {
                            context.getCounter(
                                    DelayCounters.scheduled_departure)
                                    .increment(1);
                        } else if (depDelayTime < 0) {
                            context.getCounter(DelayCounters.early_departure)
                                    .increment(1);
                        }
                    } else {
                        context.getCounter(
                                DelayCounters.not_available_departure)
                                .increment(1);
                    }
                    if (!colums[14].equals("NA")) {
                        int arrDelayTime = Integer.parseInt(colums[14]);
                        if (arrDelayTime > 0) {
                            outputKey.setYear("A," + colums[0]);
                            outputKey.setMonth(new Integer(colums[1]));
                            context.write(outputKey, outputValue);
                        } else if (arrDelayTime == 0) {
                            context.getCounter(DelayCounters.scheduled_arrival)
                                    .increment(1);
                        } else if (arrDelayTime < 0) {
                            context.getCounter(DelayCounters.early_arrival)
                                    .increment(1);
                        }
                    } else {
                        context.getCounter(DelayCounters.not_available_arrival)
                                .increment(1);
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

⑥ Implementing the reducer - DelayCountReducerWithDateKey.java
package pj01.hadoop.chapter06;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
public class DelayCountReducerWithDateKey extends
        Reducer<DateKey, IntWritable, DateKey, IntWritable> {
    private MultipleOutputs<DateKey, IntWritable> mos;
    private DateKey outputKey = new DateKey();
    private IntWritable result = new IntWritable();
    @Override
    public void setup(Context context) throws IOException, InterruptedException {
        mos = new MultipleOutputs<DateKey, IntWritable>(context);
    }
    public void reduce(DateKey key, Iterable<IntWritable> values,
            Context context) throws IOException, InterruptedException {
        String[] colums = key.getYear().split(",");
        int sum = 0;
        Integer bMonth = key.getMonth();
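        // Note: thanks to the sort comparator, the values in this group arrive in month order,
        // and Hadoop updates the fields of 'key' while iterating, so comparing bMonth with
        // key.getMonth() detects month boundaries inside a single year group.
        // (The != comparison on Integer only works because months 1-12 fall inside the
        // Integer cache; equals() would be the safer choice.)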
        if (colums[0].equals("D")) {
            for (IntWritable value : values) {
                if (bMonth != key.getMonth()) {
                    result.set(sum);
                    outputKey.setYear(key.getYear().substring(2));
                    outputKey.setMonth(bMonth);
                    mos.write("departure", outputKey, result);
                    sum = 0; // reset the running total
                }
                sum += value.get();
                bMonth = key.getMonth();
            }
            // emit the data for the last month (December) of this year
            if (key.getMonth() == bMonth) {
                outputKey.setYear(key.getYear().substring(2));
                outputKey.setMonth(key.getMonth());
                result.set(sum);
                mos.write("departure", outputKey, result);
            }
        } else {
            for (IntWritable value : values) {
                if (bMonth != key.getMonth()) {
                    result.set(sum);
                    outputKey.setYear(key.getYear().substring(2));
                    outputKey.setMonth(bMonth);
                    mos.write("arrival", outputKey, result);
                    sum = 0;
                }
                sum += value.get();
                bMonth = key.getMonth();
            }
            if (key.getMonth() == bMonth) {
                outputKey.setYear(key.getYear().substring(2));
                outputKey.setMonth(key.getMonth());
                result.set(sum);
                mos.write("arrival", outputKey, result);
            }
        }
    }
    @Override
    public void cleanup(Context context) throws IOException,
            InterruptedException {
        mos.close();
    }
}

⑦ Implementing the driver - DelayCountWithDateKey.java
package pj01.hadoop.chapter06;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class DelayCountWithDateKey extends Configured implements Tool {
    public int run(String[] args) throws Exception {
        String[] otherArgs = new GenericOptionsParser(getConf(), args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: DelayCountWithDateKey <in> <out>");
            System.exit(2);
        }
        // Set the job name
        Job job = new Job(getConf(), "DelayCountWithDateKey");
        // Set the input and output paths
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        // Set the job jar, partitioner, grouping comparator, and sort comparator classes
        job.setJarByClass(DelayCountWithDateKey.class);
        job.setPartitionerClass(GroupKeyPartitioner.class);
        job.setGroupingComparatorClass(GroupKeyComparator.class);
        job.setSortComparatorClass(DateKeyComparator.class);
        // Set the Mapper and Reducer classes
        job.setMapperClass(DelayCountMapperWithDateKey.class);
        job.setReducerClass(DelayCountReducerWithDateKey.class);
        // Set the map output key/value types
        job.setMapOutputKeyClass(DateKey.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Set the input and output formats
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // Set the output key and value types
        job.setOutputKeyClass(DateKey.class);
        job.setOutputValueClass(IntWritable.class);
        // Configure MultipleOutputs
        MultipleOutputs.addNamedOutput(job, "departure",
                TextOutputFormat.class, DateKey.class, IntWritable.class);
        MultipleOutputs.addNamedOutput(job, "arrival", TextOutputFormat.class,
                DateKey.class, IntWritable.class);
        job.waitForCompletion(true);
        return 0;
    }
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(),
                new DelayCountWithDateKey(), args); // run via the Tool interface
        System.out.println("## RESULT:" + res);
    }
}

⑧ Running the driver
[hduser@hdstudy01 hadoop]$ hadoop jar alzio-hadoop-examples.jar input delay_count_sort
17/06/13 15:50:15 INFO input.FileInputFormat: Total input paths to process : 22
17/06/13 15:50:15 INFO util.NativeCodeLoader: Loaded the native-hadoop library
17/06/13 15:50:15 WARN snappy.LoadSnappy: Snappy native library not loaded
17/06/13 15:50:16 INFO mapred.JobClient: Running job: job_201706091514_0009
17/06/13 15:50:17 INFO mapred.JobClient:  map 0% reduce 0%
17/06/13 15:50:41 INFO mapred.JobClient:  map 1% reduce 0%
(omitted)
17/06/13 16:28:10 INFO mapred.JobClient:  map 100% reduce 93%
17/06/13 16:28:16 INFO mapred.JobClient:  map 100% reduce 94%
17/06/13 16:28:19 INFO mapred.JobClient:  map 100% reduce 95%
17/06/13 16:28:25 INFO mapred.JobClient:  map 100% reduce 96%
17/06/13 16:28:31 INFO mapred.JobClient:  map 100% reduce 97%
17/06/13 16:28:34 INFO mapred.JobClient:  map 100% reduce 98%
17/06/13 16:28:40 INFO mapred.JobClient:  map 100% reduce 99%
17/06/13 16:28:42 INFO mapred.JobClient:  map 100% reduce 100%
17/06/13 16:28:44 INFO mapred.JobClient: Job complete: job_201706091514_0009
17/06/13 16:28:44 INFO mapred.JobClient: Counters: 35
17/06/13 16:28:44 INFO mapred.JobClient:   Map-Reduce Framework
17/06/13 16:28:44 INFO mapred.JobClient:     Spilled Records=386569227
17/06/13 16:28:44 INFO mapred.JobClient:     Map output materialized bytes=2143238676
17/06/13 16:28:44 INFO mapred.JobClient:     Reduce input records=107161878
17/06/13 16:28:44 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=361072050176
17/06/13 16:28:44 INFO mapred.JobClient:     Map input records=122846665
17/06/13 16:28:44 INFO mapred.JobClient:     SPLIT_RAW_BYTES=21018
17/06/13 16:28:44 INFO mapred.JobClient:     Map output bytes=1928913804
17/06/13 16:28:44 INFO mapred.JobClient:     Reduce shuffle bytes=2143238676
17/06/13 16:28:44 INFO mapred.JobClient:     Physical memory (bytes) snapshot=34841272320
17/06/13 16:28:44 INFO mapred.JobClient:     Reduce input groups=44
17/06/13 16:28:44 INFO mapred.JobClient:     Combine output records=0
17/06/13 16:28:44 INFO mapred.JobClient:     Reduce output records=0
17/06/13 16:28:44 INFO mapred.JobClient:     Map output records=107161878
17/06/13 16:28:44 INFO mapred.JobClient:     Combine input records=0
17/06/13 16:28:44 INFO mapred.JobClient:     CPU time spent (ms)=1693820
17/06/13 16:28:44 INFO mapred.JobClient:     Total committed heap usage (bytes)=24400076800
17/06/13 16:28:44 INFO mapred.JobClient:   pj01.hadoop.chapter07.DelayCounters
17/06/13 16:28:44 INFO mapred.JobClient:     scheduled_arrival=5185011
17/06/13 16:28:44 INFO mapred.JobClient:     early_arrival=57602824
17/06/13 16:28:44 INFO mapred.JobClient:     not_available_departure=2293613
17/06/13 16:28:44 INFO mapred.JobClient:     early_departure=44612589
17/06/13 16:28:44 INFO mapred.JobClient:     scheduled_departure=26259648
17/06/13 16:28:44 INFO mapred.JobClient:     not_available_arrival=2577721
17/06/13 16:28:44 INFO mapred.JobClient:   File Input Format Counters
17/06/13 16:28:44 INFO mapred.JobClient:     Bytes Read=11962658585
17/06/13 16:28:44 INFO mapred.JobClient:   FileSystemCounters
17/06/13 16:28:44 INFO mapred.JobClient:     HDFS_BYTES_READ=11962679603
17/06/13 16:28:44 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=7742736688
17/06/13 16:28:44 INFO mapred.JobClient:     FILE_BYTES_READ=5588150148
17/06/13 16:28:44 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=7239
17/06/13 16:28:44 INFO mapred.JobClient:   Job Counters
17/06/13 16:28:44 INFO mapred.JobClient:     Launched map tasks=186
17/06/13 16:28:44 INFO mapred.JobClient:     Launched reduce tasks=1
17/06/13 16:28:44 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=2187300
17/06/13 16:28:44 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
17/06/13 16:28:44 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=4163490
17/06/13 16:28:44 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
17/06/13 16:28:44 INFO mapred.JobClient:     Data-local map tasks=186
17/06/13 16:28:44 INFO mapred.JobClient:   File Output Format Counters
17/06/13 16:28:44 INFO mapred.JobClient:     Bytes Written=0
## RESULT:0

* Checking the results
- Sorted output
[hduser@hdstudy01 hadoop]$ hadoop fs -cat delay_count_sort/arrival-r-00000 | head -20
1987,10 265658
1987,11 255127
1987,12 287408
1988,1  261810
1988,2  242219
1988,3  255083
1988,4  219288
1988,5  221071
1988,6  215385
1988,7  224274
1988,8  227943
1988,9  204558
1988,10 230876
1988,11 237343
1988,12 249340
1989,1  239136
1989,2  231073
1989,3  251148
1989,4  207190
1989,5  227174
[hduser@hdstudy01 hadoop]$ hadoop fs -cat delay_count_sort/departure-r-00000 | head -20
1987,10 175568
1987,11 177218
1987,12 218858
1988,1  198610
1988,2  177939
1988,3  187141
1988,4  159216
1988,5  164107
1988,6  165596
1988,7  174844
1988,8  175591
1988,9  138322
1988,10 162211
1988,11 175123
1988,12 189137
1989,1  178161
1989,2  181324
1989,3  204720
1989,4  157890
1989,5  170654

- Reduce input groups in the previous job: 254
- Reduce input groups with year-based group sorting: 44 (22 years of input × the two key prefixes D and A)


  • Partial sort
* How partial sort works
- A technique that converts the mapper output into MapFiles so the data can be searched by key.
- When a map task runs, the partitioner decides which reduce task each map output record is sent to.
- The partitioned data is sorted by key.
- For a partial sort, the partitioned output data must be converted into MapFiles.
- To look up a particular key, the MapFile holding the data for that key is opened and queried.
- A technique frequently used for data lookups.

* Procedure
- Convert the input data into a SequenceFile, then convert the SequenceFile into a MapFile.
- Then run a program that searches the MapFile for data.


* Creating the sequence file - SequenceFileCreator.java
package pj01.hadoop.chapter06;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class SequenceFileCreator extends Configured implements Tool {
    static class DistanceMapper extends MapReduceBase implements
            Mapper<LongWritable, Text, IntWritable, Text> {
        private IntWritable outputKey = new IntWritable();
        private int distance = 0;
        public void map(LongWritable key, Text value,
                OutputCollector<IntWritable, Text> output, Reporter reporter)
                throws IOException {
            // The input key is the byte offset of each line in the CSV file, hence LongWritable
            // The output key is the flight distance (IntWritable); the output value is the whole line (Text)
            if (key.get() > 0) {
                try {
                    String[] colums = value.toString().split(",");
                    if (colums.length > 0) {
                        if (!colums[18].equals("NA") && !colums[18].equals("")) {
                            distance = Integer.parseInt(colums[18]);
                        }
                        outputKey.set(distance);
                        // look up the flight distance in colums and set it as the output key
                        output.collect(outputKey, value);
                        // emit the original comma-separated line unchanged as the value
                    }
                // some records may have missing fields, so exceptions must be handled
                } catch (ArrayIndexOutOfBoundsException ae) {
                    outputKey.set(0);
                    output.collect(outputKey, value);
                    ae.printStackTrace();
                } catch (Exception e) {
                    outputKey.set(0);
                    output.collect(outputKey, value);
                    e.printStackTrace();
                }
            }
        }
    }
    public int run(String[] args) throws Exception {
        // Configure the job settings
        JobConf conf = new JobConf(SequenceFileCreator.class);
        conf.setJobName("SequenceFileCreator");

        conf.setMapperClass(DistanceMapper.class);
        conf.setNumReduceTasks(0); // no reducer is needed, so set the count to 0

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        // Output format: SequenceFile; output key: IntWritable; output value: Text
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(Text.class);
        // Configure the output compression format
        SequenceFileOutputFormat.setCompressOutput(conf, true);
        SequenceFileOutputFormat
                .setOutputCompressorClass(conf, GzipCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(conf,
                CompressionType.BLOCK);
        JobClient.runJob(conf);
        return 0;
    }
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(),
                new SequenceFileCreator(), args);
        System.out.println("## RESULT:" + res);
    }
}
- The mapper is implemented as an inner class.
- The job performs no aggregation on the input data, so no reducer is implemented.
- The partial sort has to be implemented with the old org.apache.hadoop.mapred API.
  → Reason: a job written against the org.apache.hadoop.mapreduce API cannot use org.apache.hadoop.mapred.MapFileOutputFormat as its output format.

- Execution
[hduser@hdstudy01 hadoop]$ hadoop jar alzio-hadoop-examples.jar  input/2000.csv 2000_seq_file
17/06/13 16:53:31 WARN mapred.JobClient: Use GenericOptionsParser for parsing the argumens. Applications should implement Tool for the same.
17/06/13 16:53:31 INFO util.NativeCodeLoader: Loaded the native-hadoop library
17/06/13 16:53:31 WARN snappy.LoadSnappy: Snappy native library not loaded
17/06/13 16:53:31 INFO mapred.FileInputFormat: Total input paths to process : 1
17/06/13 16:53:32 INFO mapred.JobClient: Running job: job_201706091514_0010
17/06/13 16:53:33 INFO mapred.JobClient:  map 0% reduce 0%
17/06/13 16:53:49 INFO mapred.JobClient:  map 5% reduce 0%
17/06/13 16:53:52 INFO mapred.JobClient:  map 10% reduce 0%
17/06/13 16:53:55 INFO mapred.JobClient:  map 15% reduce 0%
17/06/13 16:53:58 INFO mapred.JobClient:  map 21% reduce 0%
(omitted)
17/06/13 16:54:59 INFO mapred.JobClient:  map 76% reduce 0%
17/06/13 16:55:02 INFO mapred.JobClient:  map 82% reduce 0%
17/06/13 16:55:05 INFO mapred.JobClient:  map 85% reduce 0%
17/06/13 16:55:08 INFO mapred.JobClient:  map 87% reduce 0%
17/06/13 16:55:09 INFO mapred.JobClient:  map 88% reduce 0%
17/06/13 16:55:17 INFO mapred.JobClient:  map 100% reduce 0%
17/06/13 16:55:19 INFO mapred.JobClient: Job complete: job_201706091514_0010
17/06/13 16:55:19 INFO mapred.JobClient: Counters: 20
17/06/13 16:55:19 INFO mapred.JobClient:   Map-Reduce Framework
17/06/13 16:55:19 INFO mapred.JobClient:     Spilled Records=0
17/06/13 16:55:19 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=17396047872
17/06/13 16:55:19 INFO mapred.JobClient:     Map input records=5683048
17/06/13 16:55:19 INFO mapred.JobClient:     SPLIT_RAW_BYTES=909
17/06/13 16:55:19 INFO mapred.JobClient:     Map output records=5683047
17/06/13 16:55:19 INFO mapred.JobClient:     Physical memory (bytes) snapshot=700948480
17/06/13 16:55:19 INFO mapred.JobClient:     Map input bytes=570151613
17/06/13 16:55:19 INFO mapred.JobClient:     CPU time spent (ms)=67450
17/06/13 16:55:19 INFO mapred.JobClient:     Total committed heap usage (bytes)=273678336
17/06/13 16:55:19 INFO mapred.JobClient:   File Input Format Counters
17/06/13 16:55:19 INFO mapred.JobClient:     Bytes Read=570184381
17/06/13 16:55:19 INFO mapred.JobClient:   FileSystemCounters
17/06/13 16:55:19 INFO mapred.JobClient:     HDFS_BYTES_READ=570185290
17/06/13 16:55:19 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=510327
17/06/13 16:55:19 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=116428966
17/06/13 16:55:19 INFO mapred.JobClient:   File Output Format Counters
17/06/13 16:55:19 INFO mapred.JobClient:     Bytes Written=116428966
17/06/13 16:55:19 INFO mapred.JobClient:   Job Counters
17/06/13 16:55:19 INFO mapred.JobClient:     Launched map tasks=9
17/06/13 16:55:19 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=0
17/06/13 16:55:19 INFO mapred.JobClient:     Total time spent by all reduces waiting afte reserving slots (ms)=0
17/06/13 16:55:19 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=196704
17/06/13 16:55:19 INFO mapred.JobClient:     Total time spent by all maps waiting after rserving slots (ms)=0
17/06/13 16:55:19 INFO mapred.JobClient:     Data-local map tasks=9
## RESULT:0
[hduser@hdstudy01 hadoop]$
Map input records=5683048
Map output records=5683047
The count drops by one because the header (column-name) row is skipped by the mapper.


- Checking the results
[hduser@hdstudy01 hadoop]$ hadoop fs -lsr 2000_seq_file
-rw-r--r--   1 hduser supergroup          0 2017-06-13 16:55 /user/hduser/2000_seq_file/_SUCCESS
drwxr-xr-x   - hduser supergroup          0 2017-06-13 16:53 /user/hduser/2000_seq_file/_logs
drwxr-xr-x   - hduser supergroup          0 2017-06-13 16:53 /user/hduser/2000_seq_file/_logs/history
-rw-r--r--   1 hduser supergroup      32135 2017-06-13 16:53 /user/hduser/2000_seq_file/_logs/history/job_201706091514_0010_1497340412393_hduser_SequenceFileCreator
-rw-r--r--   1 hduser supergroup      48963 2017-06-13 16:53 /user/hduser/2000_seq_file/_logs/history/job_201706091514_0010_conf.xml
-rw-r--r--   1 hduser supergroup   13649408 2017-06-13 16:53 /user/hduser/2000_seq_file/part-00000
-rw-r--r--   1 hduser supergroup   13653803 2017-06-13 16:53 /user/hduser/2000_seq_file/part-00001
-rw-r--r--   1 hduser supergroup   13665636 2017-06-13 16:54 /user/hduser/2000_seq_file/part-00002
-rw-r--r--   1 hduser supergroup   13848570 2017-06-13 16:54 /user/hduser/2000_seq_file/part-00003
-rw-r--r--   1 hduser supergroup   13720965 2017-06-13 16:54 /user/hduser/2000_seq_file/part-00004
-rw-r--r--   1 hduser supergroup   13684583 2017-06-13 16:54 /user/hduser/2000_seq_file/part-00005
-rw-r--r--   1 hduser supergroup   13635757 2017-06-13 16:54 /user/hduser/2000_seq_file/part-00006
-rw-r--r--   1 hduser supergroup   13679743 2017-06-13 16:54 /user/hduser/2000_seq_file/part-00007
-rw-r--r--   1 hduser supergroup    6890501 2017-06-13 16:55 /user/hduser/2000_seq_file/part-00008
[hduser@hdstudy01 hadoop]$
[hduser@hdstudy01 hadoop]$ hadoop fs -text 2000_seq_file/part-00000 | head -20
17/06/13 17:00:31 INFO util.NativeCodeLoader: Loaded the native-hadoop library
17/06/13 17:00:31 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
17/06/13 17:00:31 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 17:00:31 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 17:00:31 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 17:00:31 INFO compress.CodecPool: Got brand-new decompressor
1587    2000,1,28,5,1647,1647,1906,1859,HP,154,N808AW,259,252,233,7,0,ATL,PHX,1587,15,11,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,29,6,1648,1647,1939,1859,HP,154,N653AW,291,252,239,40,1,ATL,PHX,1587,5,47,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,30,7,NA,1647,NA,1859,HP,154,N801AW,NA,252,NA,NA,NA,ATL,PHX,1587,0,0,1,NA,0,NA,NA,NA,NA,NA
1587    2000,1,31,1,1645,1647,1852,1859,HP,154,N806AW,247,252,226,-7,-2,ATL,PHX,1587,7,14,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,1,6,842,846,1057,1101,HP,609,N158AW,255,255,244,-4,-4,ATL,PHX,1587,3,8,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,2,7,849,846,1148,1101,HP,609,N656AW,299,255,267,47,3,ATL,PHX,1587,8,24,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,3,1,844,846,1121,1101,HP,609,N803AW,277,255,244,20,-2,ATL,PHX,1587,6,27,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,1,6,1702,1657,1912,1908,HP,611,N652AW,250,251,232,4,5,ATL,PHX,1587,5,13,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,2,7,1658,1657,1901,1908,HP,611,N807AW,243,251,233,-7,1,ATL,PHX,1587,3,7,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,3,1,1656,1657,1922,1908,HP,611,N807AW,266,251,241,14,-1,ATL,PHX,1587,5,20,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,4,2,1955,1932,2230,2153,HP,613,N509DC,275,261,232,37,23,ATL,PHX,1587,5,38,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,5,3,1934,1932,2133,2153,HP,613,N509DC,239,261,224,-20,2,ATL,PHX,1587,5,10,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,6,4,1929,1932,2125,2153,HP,613,N303AW,236,261,220,-28,-3,ATL,PHX,1587,5,11,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,7,5,1932,1932,2146,2153,HP,613,N173AW,254,261,237,-7,0,ATL,PHX,1587,4,13,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,9,7,2008,1932,2221,2153,HP,613,N168AW,253,261,237,28,36,ATL,PHX,1587,4,12,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,10,1,1926,1932,2147,2153,HP,613,N160AW,261,261,235,-6,-6,ATL,PHX,1587,7,19,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,11,2,1932,1932,2126,2153,HP,613,N160AW,234,261,217,-27,0,ATL,PHX,1587,6,11,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,12,3,1936,1932,2142,2153,HP,613,N322AW,246,261,227,-11,4,ATL,PHX,1587,7,12,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,13,4,1942,1932,2153,2153,HP,613,N160AW,251,261,220,0,10,ATL,PHX,1587,5,26,0,NA,0,NA,NA,NA,NA,NA
1587    2000,1,14,5,1932,1932,2131,2153,HP,613,N314AW,239,261,218,-22,0,ATL,PHX,1587,6,15,0,NA,0,NA,NA,NA,NA,NA
text: Unable to write to output stream.
[hduser@hdstudy01 hadoop]$

* Creating the MapFile
- MapFile: a sorted SequenceFile with an index so that values can be looked up by key.
- It consists of an index file that stores the physical index and a data file that stores the actual records (a direct-read sketch follows the index listing further below).
- It takes the SequenceFile created by SequenceFileCreator as input and converts it into a MapFile.
- This driver class needs no data analysis, so no mapper or reducer is configured.
- When no mapper or reducer is set, JobClient falls back to the old mapred API defaults, org.apache.hadoop.mapred.lib.IdentityMapper and IdentityReducer, which pass records through unchanged.


- Implementation
package pj01.hadoop.chapter06;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MapFileCreator extends Configured implements Tool {
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(MapFileCreator.class);
        conf.setJobName("MapFileCreator");
        // Set the input and output paths
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        // Use SequenceFile as the input format
        conf.setInputFormat(SequenceFileInputFormat.class);
        // Use MapFileOutputFormat so the output is written as MapFiles
        conf.setOutputFormat(MapFileOutputFormat.class);
        // The output key is the flight distance, so IntWritable
        conf.setOutputKeyClass(IntWritable.class);
        SequenceFileOutputFormat.setCompressOutput(conf, true);
        SequenceFileOutputFormat
                .setOutputCompressorClass(conf, GzipCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(conf,
                CompressionType.BLOCK);
        JobClient.runJob(conf);
        return 0;
    }
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new MapFileCreator(),
                args);
        System.out.println("## RESULT:" + res);
    }
}

- Execution
[hduser@hdstudy01 hadoop]$ hadoop jar alzio-hadoop-examples.jar  2000_seq_file 2000_map_file
17/06/13 17:26:52 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
17/06/13 17:26:52 INFO mapred.FileInputFormat: Total input paths to process : 9
17/06/13 17:26:52 INFO mapred.JobClient: Running job: job_201706091514_0011
17/06/13 17:26:53 INFO mapred.JobClient:  map 0% reduce 0%
17/06/13 17:27:08 INFO mapred.JobClient:  map 19% reduce 0%
17/06/13 17:27:11 INFO mapred.JobClient:  map 22% reduce 0%
17/06/13 17:27:25 INFO mapred.JobClient:  map 42% reduce 0%
(omitted)
17/06/13 17:28:47 INFO mapred.JobClient:  map 100% reduce 95%
17/06/13 17:28:50 INFO mapred.JobClient:  map 100% reduce 97%
17/06/13 17:28:53 INFO mapred.JobClient:  map 100% reduce 99%
17/06/13 17:28:54 INFO mapred.JobClient:  map 100% reduce 100%
17/06/13 17:28:56 INFO mapred.JobClient: Job complete: job_201706091514_0011
17/06/13 17:28:56 INFO mapred.JobClient: Counters: 30
17/06/13 17:28:56 INFO mapred.JobClient:   Map-Reduce Framework
17/06/13 17:28:56 INFO mapred.JobClient:     Spilled Records=17049141
17/06/13 17:28:56 INFO mapred.JobClient:     Map output materialized bytes=604249649
17/06/13 17:28:56 INFO mapred.JobClient:     Reduce input records=5683047
17/06/13 17:28:56 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=19352600576
17/06/13 17:28:56 INFO mapred.JobClient:     Map input records=5683047
17/06/13 17:28:56 INFO mapred.JobClient:     SPLIT_RAW_BYTES=999
17/06/13 17:28:56 INFO mapred.JobClient:     Map output bytes=592883501
17/06/13 17:28:56 INFO mapred.JobClient:     Reduce shuffle bytes=604249649
17/06/13 17:28:56 INFO mapred.JobClient:     Physical memory (bytes) snapshot=1750589440
17/06/13 17:28:56 INFO mapred.JobClient:     Map input bytes=116427841
17/06/13 17:28:56 INFO mapred.JobClient:     Reduce input groups=1110
17/06/13 17:28:56 INFO mapred.JobClient:     Combine output records=0
17/06/13 17:28:56 INFO mapred.JobClient:     Reduce output records=5683047
17/06/13 17:28:56 INFO mapred.JobClient:     Map output records=5683047
17/06/13 17:28:56 INFO mapred.JobClient:     Combine input records=0
17/06/13 17:28:56 INFO mapred.JobClient:     CPU time spent (ms)=81670
17/06/13 17:28:56 INFO mapred.JobClient:     Total committed heap usage (bytes)=1236570112
17/06/13 17:28:56 INFO mapred.JobClient:   File Input Format Counters
17/06/13 17:28:56 INFO mapred.JobClient:     Bytes Read=116428966
17/06/13 17:28:56 INFO mapred.JobClient:   FileSystemCounters
17/06/13 17:28:56 INFO mapred.JobClient:     HDFS_BYTES_READ=116429965
17/06/13 17:28:56 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=1813314010
17/06/13 17:28:56 INFO mapred.JobClient:     FILE_BYTES_READ=1208499400
17/06/13 17:28:56 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=115851767
17/06/13 17:28:56 INFO mapred.JobClient:   File Output Format Counters
17/06/13 17:28:56 INFO mapred.JobClient:     Bytes Written=115851767
17/06/13 17:28:56 INFO mapred.JobClient:   Job Counters
17/06/13 17:28:56 INFO mapred.JobClient:     Launched map tasks=9
17/06/13 17:28:56 INFO mapred.JobClient:     Launched reduce tasks=1
17/06/13 17:28:56 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=103133
17/06/13 17:28:56 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
17/06/13 17:28:56 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=134530
17/06/13 17:28:56 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
17/06/13 17:28:56 INFO mapred.JobClient:     Data-local map tasks=9
## RESULT:0

- Checking the results
[hduser@hdstudy01 hadoop]$ hadoop fs -ls 2000_map_file
Found 3 items
-rw-r--r--   1 hduser supergroup          0 2017-06-13 17:28 /user/hduser/2000_map_file/_SUCCESS
drwxr-xr-x   - hduser supergroup          0 2017-06-13 17:26 /user/hduser/2000_map_file/_logs
drwxr-xr-x   - hduser supergroup          0 2017-06-13 17:28 /user/hduser/2000_map_file/part-00000
[hduser@hdstudy01 hadoop]$
[hduser@hdstudy01 hadoop]$ hadoop fs -text 2000_map_file/part-00000/data | head -10
17/06/13 17:33:26 INFO util.NativeCodeLoader: Loaded the native-hadoop library
17/06/13 17:33:26 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
17/06/13 17:33:26 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 17:33:26 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 17:33:26 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 17:33:26 INFO compress.CodecPool: Got brand-new decompressor
11      2000,6,28,3,NA,0,NA,0,AA,1436,UNKNOW,NA,NA,NA,NA,NA,JFK,LGA,11,0,0,1,NA,0,NA,NA,NA,NA,NA
17      2000,3,9,4,1052,1030,1127,1108,US,940,N622AU,35,38,12,19,22,EWR,LGA,17,6,17,0,NA,0,NA,NA,NA,NA,NA
21      2000,3,30,4,204,150,248,215,DL,1490,N637DL,44,25,10,33,14,MIA,FLL,21,5,29,0,NA,0,NA,NA,NA,NA,NA
21      2000,3,26,7,146,150,214,215,DL,1490,N642DL,28,25,11,-1,-4,MIA,FLL,21,6,11,0,NA,0,NA,NA,NA,NA,NA
21      2000,3,29,3,130,150,157,215,DL,1490,N690DL,27,25,12,-18,-20,MIA,FLL,21,6,9,0,NA,0,NA,NA,NA,NA,NA
21      2000,3,27,1,145,150,223,215,DL,1490,N626DL,38,25,21,8,-5,MIA,FLL,21,6,11,0,NA,0,NA,NA,NA,NA,NA
21      2000,3,28,2,126,150,151,215,DL,1490,N624DL,25,25,11,-24,-24,MIA,FLL,21,5,9,0,NA,0,NA,NA,NA,NA,NA
21      2000,3,31,5,149,150,220,215,DL,1490,N626DL,31,25,13,5,-1,MIA,FLL,21,6,12,0,NA,0,NA,NA,NA,NA,NA
21      2000,5,29,1,200,150,233,217,DL,1490,N624DL,33,27,14,16,10,MIA,FLL,21,8,11,0,NA,0,NA,NA,NA,NA,NA
21      2000,5,30,2,132,150,202,217,DL,1490,N695DA,30,27,12,-15,-18,MIA,FLL,21,7,11,0,NA,0,NA,NA,NA,NA,NA
text: Unable to write to output stream.
[hduser@hdstudy01 hadoop]$ hadoop fs -text 2000_map_file/part-00000/index | head -10
17/06/13 17:33:35 INFO util.NativeCodeLoader: Loaded the native-hadoop library
17/06/13 17:33:35 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
17/06/13 17:33:35 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 17:33:35 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 17:33:35 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 17:33:35 INFO compress.CodecPool: Got brand-new decompressor
11      125
21      125
24      125
24      125
24      125
24      125
24      125
24      125
24      125
24      125
text: Unable to write to output stream.
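Before moving on to the partition-aware search program, the index/data layout shown above can be exercised directly with org.apache.hadoop.io.MapFile.Reader. The following is only a minimal sketch, assuming the Hadoop 1.x Reader(FileSystem, String, Configuration) constructor and the 2000_map_file/part-00000 directory produced above; the class name MapFileReadSketch is purely illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;

public class MapFileReadSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Directory that holds the "index" and "data" files written by MapFileCreator
        Path dir = new Path("2000_map_file/part-00000");
        FileSystem fs = dir.getFileSystem(conf);
        // The Reader loads the index file into memory, binary-searches it for the key,
        // and then seeks into the data file near the matching position
        MapFile.Reader reader = new MapFile.Reader(fs, dir.toString(), conf);
        IntWritable key = new IntWritable(1587); // flight distance used as the key
        Text value = new Text();
        if (reader.get(key, value) != null) {    // positions the reader at the first record for the key
            System.out.println(value);           // the original comma-separated flight record
        } else {
            System.out.println("The requested key was not found.");
        }
        reader.close();
    }
}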

* Implementing the search program - SearchValueList.java
- The search relies on the partitioner: the partition number that the requested key belongs to is computed first, and then the MapFile for that partition is opened to retrieve the data.
- FileSystem is Hadoop's file-system abstraction class; whether the local file system or HDFS is used, files must always be accessed through the FileSystem class.
package pj01.hadoop.chapter06;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class SearchValueList extends Configured implements Tool {
    public int run(String[] args) throws Exception {
        Path path = new Path(args[0]);
        FileSystem fs = path.getFileSystem(getConf());

        // Look up the MapFiles stored in the given directory
        // (getReaders returns an array of org.apache.hadoop.io.MapFile.Reader)
        Reader[] readers = MapFileOutputFormat.getReaders(fs, path, getConf());

        // The MapFile key is the flight distance, so wrap the user-supplied distance in an IntWritable
        IntWritable key = new IntWritable();
        key.set(Integer.parseInt(args[1]));
        Text value = new Text();
        // Partition information is needed to access the right MapFile, so create a HashPartitioner
        // (the MapFiles built in the previous step were hash-partitioned)
        Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
        // A Reader iterates over the records stored in one MapFile.
        // getPartition returns the partition number for the given key,
        // and that number selects the matching Reader from the array obtained above.
        Reader reader = readers[partitioner.getPartition(key, value,
                readers.length)];
        // Look up the value for the given key (the first matching record is returned)
        Writable entry = reader.get(key, value);
        if (entry == null) {
            System.out.println("The requested key was not found.");
        }
        // next() moves to the following record and fills the key and value parameters with it.
        // If the key was found, keep printing values while the next key still equals the requested key.
        IntWritable nextKey = new IntWritable();
        do {
            System.out.println(value.toString());
        } while (reader.next(nextKey, value) && key.equals(nextKey));
        return 0;
    }
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new SearchValueList(),
                args);
        System.out.println("## RESULT:" + res);
    }
}

- Execution 1
[hduser@hdstudy01 hadoop]$ hadoop jar alzio-hadoop-examples.jar  2000_map_file 1587
Exception in thread "main" java.io.FileNotFoundException: File does not exist: hdfs://hdstudy01:9000/user/hduser/2000_map_file/_SUCCESS/data
        at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:558)
        at org.apache.hadoop.fs.FileSystem.getLength(FileSystem.java:816)
        at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1479)
        at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1474)
        at org.apache.hadoop.io.MapFile$Reader.createDataFileReader(MapFile.java:302)
        at org.apache.hadoop.io.MapFile$Reader.open(MapFile.java:284)
        at org.apache.hadoop.io.MapFile$Reader.<init>(MapFile.java:273)
        at org.apache.hadoop.io.MapFile$Reader.<init>(MapFile.java:260)
        at org.apache.hadoop.io.MapFile$Reader.<init>(MapFile.java:253)
        at org.apache.hadoop.mapred.MapFileOutputFormat.getReaders(MapFileOutputFormat.java:93)
        at pj01.hadoop.chapter07.SearchValueList.run(SearchValueList.java:24)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
        at pj01.hadoop.chapter07.SearchValueList.main(SearchValueList.java:48)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.hadoop.util.RunJar.main(RunJar.java:160)
[hduser@hdstudy01 hadoop]$
※ Why the error occurs: MapFileOutputFormat's getReaders method tries to open a MapFile in every entry of the directory passed as a parameter, so the _SUCCESS marker and the _logs directory make it fail.
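A programmatic alternative to the shell cleanup shown next would be to delete those entries with the FileSystem API before calling getReaders. This is only a sketch; the class name CleanMapFileDir and the hard-coded paths are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper (not part of the original example): deletes the job bookkeeping
// entries so that MapFileOutputFormat.getReaders only sees MapFile directories.
public class CleanMapFileDir {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        fs.delete(new Path("2000_map_file/_SUCCESS"), false); // plain marker file
        fs.delete(new Path("2000_map_file/_logs"), true);     // directory, delete recursively
    }
}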

[hduser@hdstudy01 hadoop]$ hadoop fs -ls 2000_map_file
Found 3 items
-rw-r--r--   1 hduser supergroup          0 2017-06-13 17:28 /user/hduser/2000_map_file/_SUCCESS
drwxr-xr-x   - hduser supergroup          0 2017-06-13 17:26 /user/hduser/2000_map_file/_logs
drwxr-xr-x   - hduser supergroup          0 2017-06-13 17:28 /user/hduser/2000_map_file/part-00000
[hduser@hdstudy01 hadoop]$ hadoop fs -rmr 2000_map_file/_*
Deleted hdfs://hdstudy01:9000/user/hduser/2000_map_file/_SUCCESS
Deleted hdfs://hdstudy01:9000/user/hduser/2000_map_file/_logs
[hduser@hdstudy01 hadoop]$ hadoop fs -ls 2000_map_file
Found 1 items
drwxr-xr-x   - hduser supergroup          0 2017-06-13 17:28 /user/hduser/2000_map_file/part-00000
[hduser@hdstudy01 hadoop]$
- Execution 2
[hduser@hdstudy01 hadoop]$ hadoop jar alzio-hadoop-examples.jar  2000_map_file 1587 | head -10
17/06/13 18:13:59 INFO util.NativeCodeLoader: Loaded the native-hadoop library
17/06/13 18:13:59 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
17/06/13 18:13:59 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 18:13:59 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 18:13:59 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 18:13:59 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 18:13:59 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 18:13:59 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 18:13:59 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 18:13:59 INFO compress.CodecPool: Got brand-new decompressor
2000,12,26,2,2339,2340,451,500,DL,538,N6709,192,200,174,-9,-1,PHX,ATL,1587,7,11,0,NA,0,NA,NA,NA,NA,NA
2000,12,23,6,2356,2340,502,500,DL,538,N639DL,186,200,165,2,16,PHX,ATL,1587,7,14,0,NA,0,NA,NA,NA,NA,NA
2000,12,22,5,2347,2340,455,500,DL,538,N628DL,188,200,168,-5,7,PHX,ATL,1587,7,13,0,NA,0,NA,NA,NA,NA,NA
2000,12,21,4,2342,2340,452,500,DL,538,N668DN,190,200,168,-8,2,PHX,ATL,1587,5,17,0,NA,0,NA,NA,NA,NA,NA
2000,12,20,3,2344,2340,451,500,DL,538,N654DL,187,200,167,-9,4,PHX,ATL,1587,7,13,0,NA,0,NA,NA,NA,NA,NA
2000,12,19,2,2343,2340,502,500,DL,538,N631DL,199,200,170,2,3,PHX,ATL,1587,7,22,0,NA,0,NA,NA,NA,NA,NA
2000,12,18,1,NA,2340,NA,500,DL,538,N751AT,NA,200,NA,NA,NA,PHX,ATL,1587,0,0,1,NA,0,NA,NA,NA,NA,NA
2000,12,17,7,2337,2340,454,500,DL,538,N683DA,197,200,168,-6,-3,PHX,ATL,1587,8,21,0,NA,0,NA,NA,NA,NA,NA
2000,12,16,6,NA,2340,NA,500,DL,538,N699DL,NA,200,NA,NA,NA,PHX,ATL,1587,0,0,1,NA,0,NA,NA,NA,NA,NA
2000,12,15,5,NA,2340,NA,500,DL,538,N6707A,NA,200,NA,NA,NA,PHX,ATL,1587,0,0,1,NA,0,NA,NA,NA,NA,NA


  • Total sort

- Every MapReduce job sorts its data by key, so a single partition comes out fully sorted; however, sorting tens of gigabytes through a single partition causes problems.
- The load is concentrated on the one DataNode running the reduce task while the other DataNodes sit idle, so the benefit of distributed processing is lost.
- Solution outline
       Sample the input data to survey its distribution
   → Create partition information that matches the distribution
   → Produce the output data according to the partition information
   → Merge the outputs

- The org.apache.hadoop.mapred.lib.TotalOrderPartitioner class is provided
   : the class used for total sorting; it determines the number of partitions and the range of keys stored in each partition

- The InputSampler class is provided
  : it distributes the data evenly across the partitions
  : it extracts a given number of records from the input and samples their keys and counts to build the distribution

- How total sort works

- Implementation - SequenceFileTotalSort.java

package pj01.hadoop.chapter06;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.InputSampler;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class SequenceFileTotalSort extends Configured implements Tool {
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf(), SequenceFileTotalSort.class);
        conf.setJobName("SequenceFileTotalSort");
        // The flight data stored as a SequenceFile is used as the input
        conf.setInputFormat(SequenceFileInputFormat.class);
        // Write the data sorted within each partition as a SequenceFile
        // (TotalOrderPartitioner is optimized for SequenceFiles)
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        // Sorting is by flight distance, so the output key is IntWritable
        conf.setOutputKeyClass(IntWritable.class);
        // Set the partitioner (HashPartitioner is the default otherwise)
        conf.setPartitionerClass(TotalOrderPartitioner.class);
        // Configure compression for the SequenceFile output
        SequenceFileOutputFormat.setCompressOutput(conf, true);
        SequenceFileOutputFormat
                .setOutputCompressorClass(conf, GzipCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(conf,
                CompressionType.BLOCK);
        // Set the input and output paths
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        // Place the partition file as _partitions under the input directory
        Path inputDir = FileInputFormat.getInputPaths(conf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
        Path partitionFile = new Path(inputDir, "_partitions");
        // Register the partition file path
        TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
        // Sample up to 1,000 records at a sampling frequency of 0.1 from at most 10 input splits
        InputSampler.Sampler<IntWritable, Text> sampler = new InputSampler.RandomSampler<IntWritable, Text>(
                0.1, 1000, 10);
        // Write the key information supplied by the sampler to the partition file
        InputSampler.writePartitionFile(conf, sampler);
        // Register the partition file in the distributed cache so every task can read it
        URI partitionUri = new URI(partitionFile.toString() + "#_partitions");
        DistributedCache.addCacheFile(partitionUri, conf);
        DistributedCache.createSymlink(conf);
        // Run the job
        JobClient.runJob(conf);
        return 0;
    }
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(),
                new SequenceFileTotalSort(), args);
        System.out.println("## RESULT:" + res);
    }
}

- Execution
[hduser@hdstudy01 hadoop]$ hadoop jar alzio-hadoop-examples.jar  2000_seq_file 2000_tot_sort
17/06/13 18:54:30 INFO mapred.FileInputFormat: Total input paths to process : 9
17/06/13 18:54:30 INFO util.NativeCodeLoader: Loaded the native-hadoop library
17/06/13 18:54:30 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
17/06/13 18:54:30 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 18:54:30 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 18:54:30 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 18:54:30 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 18:54:36 INFO lib.InputSampler: Using 1000 samples
17/06/13 18:54:36 INFO compress.CodecPool: Got brand-new compressor
17/06/13 18:54:37 INFO mapred.FileInputFormat: Total input paths to process : 9
17/06/13 18:54:37 INFO mapred.JobClient: Running job: job_201706091514_0012
17/06/13 18:54:38 INFO mapred.JobClient:  map 0% reduce 0%
17/06/13 18:54:53 INFO mapred.JobClient:  map 19% reduce 0%
17/06/13 18:54:57 INFO mapred.JobClient:  map 22% reduce 0%
17/06/13 18:55:11 INFO mapred.JobClient:  map 42% reduce 0%
17/06/13 18:55:14 INFO mapred.JobClient:  map 44% reduce 0%
(omitted)
17/06/13 18:56:34 INFO mapred.JobClient:  map 100% reduce 95%
17/06/13 18:56:37 INFO mapred.JobClient:  map 100% reduce 98%
17/06/13 18:56:40 INFO mapred.JobClient:  map 100% reduce 100%
17/06/13 18:56:42 INFO mapred.JobClient: Job complete: job_201706091514_0012
17/06/13 18:56:42 INFO mapred.JobClient: Counters: 30
17/06/13 18:56:42 INFO mapred.JobClient:   Map-Reduce Framework
17/06/13 18:56:42 INFO mapred.JobClient:     Spilled Records=17049141
17/06/13 18:56:42 INFO mapred.JobClient:     Map output materialized bytes=604249649
17/06/13 18:56:42 INFO mapred.JobClient:     Reduce input records=5683047
17/06/13 18:56:42 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=19352649728
17/06/13 18:56:42 INFO mapred.JobClient:     Map input records=5683047
17/06/13 18:56:42 INFO mapred.JobClient:     SPLIT_RAW_BYTES=999
17/06/13 18:56:42 INFO mapred.JobClient:     Map output bytes=592883501
17/06/13 18:56:42 INFO mapred.JobClient:     Reduce shuffle bytes=604249649
17/06/13 18:56:42 INFO mapred.JobClient:     Physical memory (bytes) snapshot=1744535552
17/06/13 18:56:42 INFO mapred.JobClient:     Map input bytes=116427841
17/06/13 18:56:42 INFO mapred.JobClient:     Reduce input groups=1110
17/06/13 18:56:42 INFO mapred.JobClient:     Combine output records=0
17/06/13 18:56:42 INFO mapred.JobClient:     Reduce output records=5683047
17/06/13 18:56:42 INFO mapred.JobClient:     Map output records=5683047
17/06/13 18:56:42 INFO mapred.JobClient:     Combine input records=0
17/06/13 18:56:42 INFO mapred.JobClient:     CPU time spent (ms)=83010
17/06/13 18:56:42 INFO mapred.JobClient:     Total committed heap usage (bytes)=1236570112
17/06/13 18:56:42 INFO mapred.JobClient:   File Input Format Counters
17/06/13 18:56:42 INFO mapred.JobClient:     Bytes Read=116428966
17/06/13 18:56:42 INFO mapred.JobClient:   FileSystemCounters
17/06/13 18:56:42 INFO mapred.JobClient:     HDFS_BYTES_READ=116431189
17/06/13 18:56:42 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=1813335000
17/06/13 18:56:42 INFO mapred.JobClient:     FILE_BYTES_READ=1208499400
17/06/13 18:56:42 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=115845245
17/06/13 18:56:42 INFO mapred.JobClient:   File Output Format Counters
17/06/13 18:56:42 INFO mapred.JobClient:     Bytes Written=115845245
17/06/13 18:56:42 INFO mapred.JobClient:   Job Counters
17/06/13 18:56:42 INFO mapred.JobClient:     Launched map tasks=9
17/06/13 18:56:42 INFO mapred.JobClient:     Launched reduce tasks=1
17/06/13 18:56:42 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=102608
17/06/13 18:56:42 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
17/06/13 18:56:42 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=140969
17/06/13 18:56:42 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
17/06/13 18:56:42 INFO mapred.JobClient:     Data-local map tasks=9
## RESULT:0
[hduser@hdstudy01 hadoop]$
Using 1000 samples
Total input paths to process : 9


- Checking the results
[hduser@hdstudy01 hadoop]$ hadoop fs -ls 2000_tot_sort
Found 3 items
-rw-r--r--   1 hduser supergroup          0 2017-06-13 18:56 /user/hduser/2000_tot_sort/_SUCCESS
drwxr-xr-x   - hduser supergroup          0 2017-06-13 18:54 /user/hduser/2000_tot_sort/_logs
-rw-r--r--   1 hduser supergroup  115845245 2017-06-13 18:55 /user/hduser/2000_tot_sort/part-00000
[hduser@hdstudy01 hadoop]$ hadoop fs -text 2000_tot_sort/part-00000 | head -10
17/06/13 19:01:47 INFO util.NativeCodeLoader: Loaded the native-hadoop library
17/06/13 19:01:47 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
17/06/13 19:01:47 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 19:01:47 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 19:01:47 INFO compress.CodecPool: Got brand-new decompressor
17/06/13 19:01:47 INFO compress.CodecPool: Got brand-new decompressor
11      2000,6,28,3,NA,0,NA,0,AA,1436,UNKNOW,NA,NA,NA,NA,NA,JFK,LGA,11,0,0,1,NA,0,NA,NA,NA,NA,NA
17      2000,3,9,4,1052,1030,1127,1108,US,940,N622AU,35,38,12,19,22,EWR,LGA,17,6,17,0,NA,0,NA,NA,NA,NA,NA
21      2000,3,30,4,204,150,248,215,DL,1490,N637DL,44,25,10,33,14,MIA,FLL,21,5,29,0,NA,0,NA,NA,NA,NA,NA
21      2000,3,26,7,146,150,214,215,DL,1490,N642DL,28,25,11,-1,-4,MIA,FLL,21,6,11,0,NA,0,NA,NA,NA,NA,NA
21      2000,3,29,3,130,150,157,215,DL,1490,N690DL,27,25,12,-18,-20,MIA,FLL,21,6,9,0,NA,0,NA,NA,NA,NA,NA
21      2000,3,27,1,145,150,223,215,DL,1490,N626DL,38,25,21,8,-5,MIA,FLL,21,6,11,0,NA,0,NA,NA,NA,NA,NA
21      2000,3,28,2,126,150,151,215,DL,1490,N624DL,25,25,11,-24,-24,MIA,FLL,21,5,9,0,NA,0,NA,NA,NA,NA,NA
21      2000,3,31,5,149,150,220,215,DL,1490,N626DL,31,25,13,5,-1,MIA,FLL,21,6,12,0,NA,0,NA,NA,NA,NA,NA
21      2000,5,29,1,200,150,233,217,DL,1490,N624DL,33,27,14,16,10,MIA,FLL,21,8,11,0,NA,0,NA,NA,NA,NA,NA
21      2000,5,30,2,132,150,202,217,DL,1490,N695DA,30,27,12,-15,-18,MIA,FLL,21,7,11,0,NA,0,NA,NA,NA,NA,NA
text: Unable to write to output stream.
[hduser@hdstudy01 hadoop]$
- Each partition is sorted in key order.
- Concatenating these partition files in order therefore yields a fully sorted data set.
- Use partial sort when you need to search within the sorted result.
- Use total sort when you simply need the data in sorted order (a small verification sketch follows).
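
As a quick check that the output really is globally ordered, a minimal sketch along the following lines could scan a part file and confirm that the keys never decrease. It assumes the Hadoop 1.x SequenceFile.Reader(FileSystem, Path, Configuration) API and the 2000_tot_sort/part-00000 output produced above; the class name TotalSortCheck is purely illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class TotalSortCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("2000_tot_sort/part-00000"); // one sorted partition
        FileSystem fs = path.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        IntWritable key = new IntWritable();
        Text value = new Text();
        int previous = Integer.MIN_VALUE;
        long records = 0;
        while (reader.next(key, value)) {        // iterate records in file order
            if (key.get() < previous) {
                System.out.println("out of order at record " + records + ": " + key.get());
            }
            previous = key.get();
            records++;
        }
        reader.close();
        System.out.println(records + " records scanned");
    }
}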
