Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
SlideShare a Scribd company logo
Hadoop
3    Hadoop
Hadoop

•
    -
    -
    -

•   HDFS(Hadoop Distributed Filesystem)
HDFS

•
    -
        ‣   MB, GB, TB

    -
        ‣
    -
        ‣
        ‣
HDFS

•
    -
        ‣
    -
        ‣
    -
        ‣
        ‣
HDFS

•
    -        64MB




    -
        ‣
        ‣
        ‣
HDFS

•
    -   /

    -       (   )

    -       (   )
HDFS

•
    -
               (           )

    -              (
                       )
HDFS

•
    -
    -

    -
HDFS

•

    -          (   ,   )

    -
HDFS

•
    -


        NameNode   SecondaryNameNode
HDFS

•
    -
 open()
append()
 write()
           NameNode   SecondaryNameNode
HDFS

•
    -
 open()
append()
 write()
           NameNode   SecondaryNameNode
HDFS

•
    -
 open()
append()
 write()
           NameNode   SecondaryNameNode
HDFS

•
    -


        NameNode   SecondaryNameNode
HDFS

•
    -


        NameNode   SecondaryNameNode
HDFS

•
    -


        NameNode   SecondaryNameNode
HDFS

•
    -


        NameNode   SecondaryNameNode
•   hadoop fs -copyFromLocal <localsrc> ... <dst>

•   hadoop fs -copyToLocal <src> <localdst>

•   hadoop fs -ls <path>

•   hadoop fs -mkdir <path>


•   hadoop fs -help
Hadoop

                        •hadoop fs -ls file:///
                        •hadoop fs -ls hdfs:///
                        •hadoop fs -ls hftp:///
                         URI
                                                    java

        local             file          org.apache.hadoop.fs.LocalFileSystem
        HDFS             hdfs     org.apache.hadoop.hdfs.DistributedFileSystem
        HFTP             hftp         org.apache.hadoop.hdfs.HftpFileSystem
        HSFTP            hsftp       org.apache.hadoop.hdfs.HsftpFileSystem
        HAR               har          org.apache.hadoop.fs.HarFileSystem
         KFS              kfs       org.apache.hadoop.fs.kfs.KosmosFileSystem
         FTP              ftp         org.apache.hadoop.fs.ftp.FTPFileSystem
         S3
                         s3n     org.apache.hadoop.fs.s3native.NativeS3FileSystem
    (           )
         S3
                          s3            org.apache.hadoop.fs.s3.S3FileSystem
(                   )
•   Thrift

•   C
    -   libhdfs




•   FUSE(FileSystem in Userspace)

•   WebDAV

•
    -   HTTP, FTP(           )
Java

    •   Hadoop URL

/**
 * Copies the resource named by the URL in {@code args[0]} to standard output.
 *
 * <p>The static initializer registers Hadoop's {@code FsUrlStreamHandlerFactory}
 * with {@link java.net.URL} before {@code main} runs. The JDK allows
 * {@code URL.setURLStreamHandlerFactory} to be called at most once per JVM.
 */
public class URLCat {

    static {
        URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
    }

    /**
     * @param args {@code args[0]} is the URL to read
     * @throws Exception if the URL cannot be opened or copied
     */
    public static void main(String[] args) throws Exception {
        InputStream source = null;
        try {
            URL location = new URL(args[0]);
            source = location.openStream();
            // 4096-byte copy buffer; the trailing 'false' is presumably the
            // "close streams when done" flag (confirm against Hadoop IOUtils docs),
            // which is why the input is closed explicitly in the finally block.
            IOUtils.copyBytes(source, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(source);
        }
    }
}
Java

•   FileSystem API

    public class FileSystemCat {
    	 public static void main(String[] args) throws Exception {
    	 	 String uri = args[0];
    	 	 Configuration conf = new Configuration();
    	 	 FileSystem fs = FileSystem.get(URI.create(uri), conf);
    	 	 InputStream in = null;
    	 	 try {
    	 	 	 in = fs.open(new Path(uri));
    	 	 	 IOUtils.copyBytes(in, System.out, 4096, false);
    	 	 } finally {
    	 	 	 IOUtils.closeStream(in);
    	 	 }
    	 }
Java

•       FSDataInputStream
    /**
     * Declaration excerpt: a {@code DataInputStream} subclass that additionally
     * implements {@code Seekable} and {@code PositionedReadable}.
     */
    public class FSDataInputStream extends DataInputStream
        implements Seekable, PositionedReadable {

          // (class body not shown in this excerpt)
    }




        /**
         * Positioning operations on a stream. Method descriptions below follow
         * the Hadoop API documentation for {@code Seekable}; confirm against the
         * Hadoop version in use.
         */
        public interface Seekable {
            /** Moves the current position to {@code pos}. */
            void seek(long pos) throws IOException;
            /** Returns the current position. */
            long getPos() throws IOException;
            /**
             * Presumably tries to switch to a different source of the data at
             * {@code targetPos}, returning whether one was found — TODO confirm.
             */
            boolean seekToNewSource(long targetPos) throws IOException;
        }
Java

  •   FSDataInputStream

/**
 * Prints the file named by the URI in {@code args[0]} to standard output twice,
 * rewinding the stream with {@code seek(0)} between the two passes.
 */
public class FileSystemDoubleCat {

    /**
     * @param args {@code args[0]} is the URI of the file to print
     * @throws Exception if the file cannot be opened, read, or rewound
     */
    public static void main(String[] args) throws Exception {
        final String location = args[0];
        final URI target = URI.create(location);
        final FileSystem fileSystem = FileSystem.get(target, new Configuration());

        FSDataInputStream stream = null;
        try {
            stream = fileSystem.open(new Path(location));
            for (int pass = 0; pass < 2; pass++) {
                if (pass > 0) {
                    // Rewind to the beginning before the second copy.
                    stream.seek(0);
                }
                IOUtils.copyBytes(stream, System.out, 4096, false);
            }
        } finally {
            IOUtils.closeStream(stream);
        }
    }
}
Java

•       FSDataInputStream
    /**
     * Declaration excerpt: a {@code DataInputStream} subclass that additionally
     * implements {@code Seekable} and {@code PositionedReadable}.
     */
    public class FSDataInputStream extends DataInputStream
        implements Seekable, PositionedReadable {

         // (class body not shown in this excerpt)
    }




/**
 * Reads that take an explicit {@code position} argument. Descriptions below
 * follow the Hadoop API documentation for {@code PositionedReadable}; confirm
 * against the Hadoop version in use.
 */
public interface PositionedReadable {
    /**
     * Reads up to {@code length} bytes starting at {@code position} into
     * {@code buffer} beginning at {@code offset}; presumably returns the number
     * of bytes actually read — TODO confirm.
     */
    int read(long position, byte buffer[], int offset, int length)
    throws IOException;
    /**
     * Like {@code read}, but presumably fills exactly {@code length} bytes or
     * fails — TODO confirm.
     */
    void readFully(long position, byte buffer[], int offset, int length)
    throws IOException;
    /** Variant of {@code readFully} without explicit offset/length arguments. */
    void readFully(long position, byte buffer[]) throws IOException;
}
Java

•
    -   public FSDataOutputStream create(Path f)
        throws IOException

    -   public FSDataOutputStream append(Path f)
        throws IOException
Java

•   FSDataOutputStream
    -   FileSystem   create(), append()

    -

        /**
         * Declaration excerpt: a {@code DataOutputStream} subclass implementing
         * {@code Syncable} that also exposes {@code getPos()}.
         */
        public class FSDataOutputStream extends DataOutputStream
            implements Syncable {

            /** Returns a position as a {@code long}; presumably the current write offset — TODO confirm. */
            public long getPos() throws IOException {
                // (body not shown in this excerpt)
            }

            // (remaining members not shown in this excerpt)
        }
Java

•
    -   public boolean mkdirs(Path f) throws IOException
Java

  •
// Fetch the metadata record for one path, then read its individual attributes.
// Attribute descriptions follow the Hadoop FileStatus API documentation.
FileStatus status = fs.getFileStatus(new Path("hdfs://localhost/hogehoge"));

status.isDir(); // whether the path is a directory
status.getLen();     // length of the file in bytes
status.getModificationTime();            // last modification time (milliseconds since the epoch)
status.getReplication();            // replication factor
status.getBlockSize();         // block size in bytes (presumably 64MB by default)
status.getOwner();        // owner of the path
status.getGroup();        // group of the path
status.getPermission().toString();            // permissions rendered as a string
Java

•
    -   public FileStatus[] listStatus(Path f) throws IOException;

    -   public FileStatus[] listStatus(Path f, PathFilter filter)
        throws IOException;

    -   public FileStatus[] listStatus(Path[] files)
        throws IOException;

    -   public FileStatus[] listStatus(Path[] files, PathFilter filter)
        throws IOException;
Java

•
    /**
     * Prints the path component of every {@code FileStatus} returned by
     * {@code FileSystem.listStatus} for the paths given on the command line.
     * The file system itself is resolved from {@code args[0]}.
     */
    public class ListStatus {

        /**
         * @param args paths to list; {@code args[0]} also selects the file system
         * @throws Exception if the file system cannot be obtained or listing fails
         */
        public static void main(String[] args) throws Exception {
            final String fsUri = args[0];
            final Configuration configuration = new Configuration();
            final FileSystem fileSystem = FileSystem.get(URI.create(fsUri), configuration);

            // Every argument (including the first) becomes a Path to list.
            final Path[] targets = new Path[args.length];
            int next = 0;
            for (String arg : args) {
                targets[next++] = new Path(arg);
            }

            final FileStatus[] entries = fileSystem.listStatus(targets);
            for (int k = 0; k < entries.length; k++) {
                System.out.println(entries[k].getPath().toUri().getPath());
            }
        }
    }
Java

•
    -   public FileStatus[] globStatus(Path pathPattern) throws IOException

    -   public FileStatus[] globStatus(Path pathPattern, PathFilter filter)
        throws IOException
Java

•


    [ab]                        {a,b}


    [^ab]                       {a,b}

                            {a,b}           (a b       )
    [a-b]
                    a       b
                    {a,b}           (a b           )       a   b
    [^a-b]

    {a,b}                               a    b


     \c                     c                      c
Java

•
    /**
     * Single-method predicate over {@code Path}; passed as the {@code filter}
     * argument of the {@code listStatus}/{@code globStatus} overloads.
     */
    public interface PathFilter {
        /** Returns {@code true} if {@code path} should be accepted by the filter. */
        boolean accept(Path path);
    }
Java

   •
         /**
          * A {@code PathFilter} that rejects every path whose string form fully
          * matches a regular expression and accepts all others.
          *
          * <p>The expression is compiled once in the constructor rather than on
          * every {@code accept} call, so an invalid pattern is reported at
          * construction time.
          */
         public class RegexExcludePathFilter implements PathFilter {

             /** Pre-compiled form of the exclusion expression. */
             private final java.util.regex.Pattern pattern;

             /**
              * @param regex regular expression; paths matching it in full are excluded
              * @throws java.util.regex.PatternSyntaxException if {@code regex} is not valid
              */
             public RegexExcludePathFilter(String regex) {
                 this.pattern = java.util.regex.Pattern.compile(regex);
             }

             /**
              * @param path candidate path
              * @return {@code false} when {@code path.toString()} matches the
              *         expression in its entirety, {@code true} otherwise
              */
             @Override
             public boolean accept(Path path) {
                 // matches() requires a whole-string match, same as String.matches(regex).
                 return !pattern.matcher(path.toString()).matches();
             }
         }




// Glob for /2007/*/* and pass a RegexExcludePathFilter, whose accept() returns false
// for any path whose string form fully matches ^.*/2007/12/31$.
fs.globStatus(new Path("/2007/*/*"), new RegexExcludePathFilter("^.*/2007/12/31$"));
Java

•
    -   public boolean delete(Path f, boolean recursive)
        throws IOException;
•
HDFS       DistributedFileSystem                              NameNode




            FSDataInputStream




               DataNode1             DataNode2    DataNode3              DataNode4



            block1                 block3        block1              block2

            block4                 block4        block2              block3
•
           open(new Path(“/aaa.txt”))
HDFS                                    DistributedFileSystem                              NameNode




                                         FSDataInputStream




                                            DataNode1             DataNode2    DataNode3              DataNode4



                                         block1                 block3        block1              block2

                                         block4                 block4        block2              block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (          )
HDFS                                    DistributedFileSystem                                     NameNode




                                         FSDataInputStream




                                            DataNode1                DataNode2        DataNode3              DataNode4



                                         block1                   block3             block1              block2

                                         block4                   block4             block2              block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (          )
HDFS                                    DistributedFileSystem                                     NameNode


                                                                                                   aaa.txt : block1, block2, block3, block4

                                                                                                   block1 : DataNode1, DataNode3
                                                                                                   block2 : DataNode3, DataNode4
                                         FSDataInputStream                                         block3 : DataNode2, DataNode3
                                                                                                   block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2        DataNode3                 DataNode4



                                         block1                   block3             block1                  block2

                                         block4                   block4             block2                  block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                   )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                 block3 : DataNode2, DataNode3      block3 : DataNode2, DataNode3
                                                                           block4 : DataNode1, DataNode2      block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                       block1                   block2

                                         block4                   block4                       block2                   block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                   )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                 block3 : DataNode2, DataNode3      block3 : DataNode2, DataNode3
                                                                           block4 : DataNode1, DataNode2      block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                       block1                   block2

                                         block4                   block4                       block2                   block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                   )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                 block3 : DataNode2, DataNode3      block3 : DataNode2, DataNode3
                                                                           block4 : DataNode1, DataNode2      block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                       block1                   block2

                                         block4                   block4                       block2                   block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                   )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                         read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

               close()                                                                                        block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                 block3 : DataNode2, DataNode3      block3 : DataNode2, DataNode3
                                                                           block4 : DataNode1, DataNode2      block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                       block1                   block2

                                         block4                   block4                       block2                   block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
    -
    -
    -
•
    -
    -                  9.1.1

    -
                                    (/d1/r1/n1, /d1/r1/n1) = 0
                  d1           d2   (/d1/r1/n1, /d1/r1/n2) = 2

                                    (/d1/r1/n1, /d1/r2/n3) = 4
        r1             r2      r3
                                    (/d1/r1/n1, /d2/r3/n4) = 6

n1           n2        n3      n4
•
HDFS       DistributedFileSystem                           NameNode




           FSDataOutputStream




               DateNode1           DateNode2   DateNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode




                                          FSDataOutputStream




                                              DateNode1           DateNode2   DateNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode




                                          FSDataOutputStream




                                              DateNode1           DateNode2   DateNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode




                                          FSDataOutputStream




                                              DateNode1           DateNode2   DateNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode

                       write()



                                          FSDataOutputStream




                                              DateNode1           DateNode2   DateNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode

                       write()



                                          FSDataOutputStream




                           ack




                                              DateNode1           DateNode2   DateNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode

                       write()



                                          FSDataOutputStream




                           ack




                                              DateNode1           DateNode2   DateNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode

                       write()



                                          FSDataOutputStream




                           ack




                                              DateNode1           DateNode2   DateNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                DateNode2   DateNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                DateNode2   DateNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                              NameNode

                       write()



                                          FSDataOutputStream                       block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DateNode1                DateNode2                DateNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                              NameNode

                       write()



                                          FSDataOutputStream                       block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DateNode1                DateNode2                DateNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                              NameNode

                       write()



                                          FSDataOutputStream                       block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DateNode1                DateNode2                DateNode3


                                            block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2                DateNode3


                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2                DateNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2                DateNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2                DateNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2                DateNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2                DateNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2                DateNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()


                  close()
                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                            ack




                                              DateNode1                  DateNode2                DateNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()


                  close()
                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                            ack




                                              DateNode1                  DateNode2                DateNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                DateNode2   DateNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                DateNode2   DateNode3


                                            block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                  NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2   DateNode3


                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                  NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2   DateNode3


                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                  NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2   DateNode3


                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                  NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2   DateNode3


                                            block2
                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                  NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2   DateNode3


                                            block2
                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                  NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2   DateNode3


                                            block2
                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                     NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2      DateNode3


                                            block2
                                            block1                     block1        block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                     NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2      DateNode3


                                            block2
                                            block1                     block1        block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                     NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2      DateNode3


                                            block2
                                            block1                     block1        block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                     NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2      DateNode3


                                            block2
                                            block1                     block1        block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                     NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2      DateNode3


                                            block2
                                            block1                     block1        block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                   NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                DateNode2      DateNode3


                                            block2
                                            block1                                 block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                   NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                DateNode2      DateNode3


                                            block2
                                            block1                                 block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                     NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DateNode1                  DateNode2      DateNode3


                                            block2
                                            block1                     block2        block2
•
    -                      dfs.replication.min(        1)



    -   (dfs.replication                          3)




    -
•
    1.       (       )

    2.

    3.

    4.   (       )
•
    -

             fs.create(new Path("p"));




    -

        OutputStream out = fs.create(new Path("p"));
        out.write("content".getBytes("UTF-8"));
        out.flush();
•
    -   FSDataOutputStream sync()

    -   sync()   close()


                 FSDataOutputStream out = fs.create(new Path("p"));
                 out.write("content".getBytes("UTF-8"));
                 out.flush();
                 out.sync();
•
    -
        ‣   sync()

        ‣            sync()

        ‣   sync()
distcp

•   2        HDFS

    -   hadoop distcp hdfs://namenode1/foo hdfs://namenode2/bar

    -   hadoop distcp -overwrite hdfs://namenode1/foo hdfs://namenode2/bar/foo

    -   hadoop distcp -update hdfs://namenode1/foo hdfs://namenode2/bar/foo


•   MapReduce

    -                   256MB                 (1GB                 4             )

    -                                   map             (
                        )

    -                map        1        (tasktracker)             20map
Hadoop

•

•   HAR


•   hadoop archive -archiveName files.har /my/files /my
Hadoop

•
    -
                                             (
                )

    -
    -   HAR            MapReduce
                                     (   7.2.1.4
        CombineFileInputFormat   )
•   HDFS
    -
    -
    -
    -


•   distcp

•   HAR
•

•

More Related Content

第2回 Hadoop 輪読会

  • 1. Hadoop 3 Hadoop
  • 2. Hadoop • - - - • HDFS(Hadoop Distributed Filesystem)
  • 3. HDFS • - ‣ MB, GB, TB - ‣ - ‣ ‣
  • 4. HDFS • - ‣ - ‣ - ‣ ‣
  • 5. HDFS • - 64MB - ‣ ‣ ‣
  • 6. HDFS • - / - ( ) - ( )
  • 7. HDFS • - ( ) - ( )
  • 8. HDFS • - - -
  • 9. HDFS • - ( , ) -
  • 10. HDFS • - NameNode SecondaryNameNode
  • 11. HDFS • - open() append() write() NameNode SecondaryNameNode
  • 12. HDFS • - open() append() write() NameNode SecondaryNameNode
  • 13. HDFS • - open() append() write() NameNode SecondaryNameNode
  • 14. HDFS • - NameNode SecondaryNameNode
  • 15. HDFS • - NameNode SecondaryNameNode
  • 16. HDFS • - NameNode SecondaryNameNode
  • 17. HDFS • - NameNode SecondaryNameNode
  • 18. hadoop fs -copyFromLocal <localsrc> ... <dst> • hadoop fs -copyToLocal <src> <localdst> • hadoop fs -ls <path> • hadoop fs -mkdir <path> • hadoop fs -help
  • 19. Hadoop •hadoop fs -ls file:/// •hadoop fs -ls hdfs:/// •hadoop fs -ls hftp:/// URI java local file org.apache.hadoop.fs.LocalFileSystem HDFS hdfs org.apache.hadoop.hdfs.DistributedFileSystem HFTP hftp org.apache.hadoop.hdfs.HftpFileSystem HSFTP hsftp org.apache.hadoop.hdfs.HsftpFileSystem HAR har org.apache.hadoop.fs.HarFileSystem KFS kfs org.apache.hadoop.fs.kfs.KosmosFileSystem FTP ftp org.apache.hadoop.fs.ftp.FTPFileSystem S3 s3n org.apache.hadoop.fs.s3native.NativeS3FileSystem ( ) S3 s3 org.apache.hadoop.fs.s3.S3FileSystem ( )
  • 20. Thrift • C - libhdfs • FUSE(FileSystem in Userspace) • WebDAV • - HTTP, FTP( )
  • 21. Java • Hadoop URL public class URLCat { static { URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory()); } public static void main(String[] args) throws Exception { InputStream in = null; try { in = new URL(args[0]).openStream(); IOUtils.copyBytes(in, System.out, 4096, false); } finally { IOUtils.closeStream(in); } } }
  • 22. Java • FileSystem API public class FileSystemCat { public static void main(String[] args) throws Exception { String uri = args[0]; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(uri), conf); InputStream in = null; try { in = fs.open(new Path(uri)); IOUtils.copyBytes(in, System.out, 4096, false); } finally { IOUtils.closeStream(in); } } }
  • 23. Java • FSDataInputStream public class FSDataInputStream extends DataInputStream implements Seekable, PositionedReadable { // } public interface Seekable { void seek(long pos) throws IOException; long getPos() throws IOException; boolean seekToNewSource(long targetPos) throws IOException; }
  • 24. Java • FSDataInputStream public class FileSystemDoubleCat { public static void main(String[] args) throws Exception { String uri = args[0]; FileSystem fs = FileSystem.get(URI.create(uri), new Configuration()); FSDataInputStream in = null; try { in = fs.open(new Path(uri)); IOUtils.copyBytes(in, System.out, 4096, false); in.seek(0); IOUtils.copyBytes(in, System.out, 4096, false); } finally { IOUtils.closeStream(in); } } }
  • 25. Java • FSDataInputStream public class FSDataInputStream extends DataInputStream implements Seekable, PositionedReadable { // } public interface PositionedReadable { int read(long position, byte buffer[], int offset, int length) throws IOException; void readFully(long position, byte buffer[], int offset, int length) throws IOException; void readFully(long position, byte buffer[]) throws IOException; }
  • 26. Java • - public FSDataOutputStream create(Path f) throws IOException - public FSDataOutputStream append(Path f) throws IOException
  • 27. Java • FSDataOutputStream - FileSystem create(), append() - public class FSDataOutputStream extends DataOutputStream implements Syncable { public long getPos() throws IOException { // } // }
  • 28. Java • - public boolean mkdirs(Path f) throws IOException
  • 29. Java • FileStatus status = fs.getFileStatus(new Path("hdfs://localhost/hogehoge")); status.isDir(); // status.getLen(); // status.getModificationTime(); // status.getReplication(); // status.getBlockSize(); // ( 64MB) status.getOwner(); // status.getGroup(); // status.getPermission().toString(); //
  • 30. Java • - public FileStatus[] listStatus(Path f) throws IOException; - public FileStatus[] listStatus(Path f, PathFilter filter) throws IOException; - public FileStatus[] listStatus(Path[] files) throws IOException; - public FileStatus[] listStatus(Path[] files, PathFilter filter) throws IOException;
  • 31. Java • public class ListStatus { public static void main(String[] args) throws Exception { String uri = args[0]; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(uri), conf); Path[] paths = new Path[args.length]; for (int i = 0; i < paths.length; i++) { paths[i] = new Path(args[i]); } FileStatus[] status = fs.listStatus(paths); for (FileStatus stat : status) { System.out.println(stat.getPath().toUri().getPath()); } } }
  • 32. Java • - public FileStatus[] globStatus(Path pathPattern) throws IOException - public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException
  • 33. Java • [ab] {a,b} [^ab] {a,b} {a,b} (a b ) [a-b] a b {a,b} (a b ) a b [^a-b] {a,b} a b \c c c
  • 34. Java • public interface PathFilter { boolean accept(Path path); }
  • 35. Java • public class RegexExcludePathFilter implements PathFilter { private final String regex; public RegexExcludePathFilter(String regex) { this.regex = regex; } @Override public boolean accept(Path path) { return !path.toString().matches(regex); } } fs.globStatus(new Path("/2007/*/*"), new RegexExcludePathFilter("^.*/2007/12/31$"));
  • 36. Java • - public boolean delete(Path f, boolean recursive) throws IOException;
  • 37. • HDFS DistributedFileSystem NameNode FSDataInputStream DataNode1 DataNode2 DataNode3 DataNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 38. open(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode FSDataInputStream DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 39. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode FSDataInputStream DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 40. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode aaa.txt : block1, block2, block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DataNode1 DataNode2 DataNode3 DataNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 41. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 aaa.txt : block1, block2, block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DataNode1 DataNode2 DataNode3 DataNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 42. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 43. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 44. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 45. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 46. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 47. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 48. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 49. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 close() block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 50. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 51. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 52. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 53. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 54. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 55. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 56. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 57. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 58. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 59. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 60. - - -
  • 61. - - 9.1.1 - (/d1/r1/n1, /d1/r1/n1) = 0 d1 d2 (/d1/r1/n1, /d1/r1/n2) = 2 (/d1/r1/n1, /d1/r2/n3) = 4 r1 r2 r3 (/d1/r1/n1, /d2/r3/n4) = 6 n1 n2 n3 n4
  • 62. • HDFS DistributedFileSystem NameNode FSDataOutputStream DataNode1 DataNode2 DataNode3
  • 63. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode FSDataOutputStream DateNode1 DateNode2 DateNode3
  • 64. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode FSDataOutputStream DateNode1 DateNode2 DateNode3
  • 65. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode FSDataOutputStream DateNode1 DateNode2 DateNode3
  • 66. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DateNode1 DateNode2 DateNode3
  • 67. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream ack DateNode1 DateNode2 DateNode3
  • 68. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream ack DateNode1 DateNode2 DateNode3
  • 69. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream ack DateNode1 DateNode2 DateNode3
  • 70. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3
  • 71. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3
  • 72. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3
  • 73. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3
  • 74. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1
  • 75. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1
  • 76. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 77. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 78. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 79. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 80. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 81. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 82. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() close() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 83. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() close() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 84. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3
  • 85. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block1
  • 86. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1
  • 87. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1
  • 88. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1
  • 89. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1
  • 90. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1
  • 91. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1
  • 92. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  • 93. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  • 94. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  • 95. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  • 96. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  • 97. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block2
  • 98. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block2
  • 99. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block2 block2
  • 100. - dfs.replication.min( 1) - (dfs.replication 3) -
  • 101. 1. ( ) 2. 3. 4. ( )
  • 102. - fs.create(new Path("p")); - OutputStream out = fs.create(new Path("p")); out.write("content".getBytes("UTF-8")); out.flush();
  • 103. - FSDataOutputStream sync() - sync() close() FSDataOutputStream out = fs.create(new Path("p")); out.write("content".getBytes("UTF-8")); out.flush(); out.sync();
  • 104. - ‣ sync() ‣ sync() ‣ sync()
  • 105. distcp • 2 HDFS - hadoop distcp hdfs://namenode1/foo hdfs://namenode2/bar - hadoop distcp -overwrite hdfs://namenode1/foo hdfs://namenode2/bar/foo - hadoop distcp -update hdfs://namenode1/foo hdfs://namenode2/bar/foo • MapReduce - 256MB (1GB 4 ) - map ( ) - map 1 (tasktracker) 20 map
  • 106. Hadoop • • HAR • hadoop archive -archiveName files.har /my/files /my
  • 107. Hadoop • - ( ) - - HAR MapReduce ( 7.2.1.4 CombineFileInputFormat )
  • 108. HDFS - - - - • distcp • HAR