Hadoop Overview

Hadoop is an open-source distributed computing platform under the Apache Software Foundation that gives users a distributed infrastructure whose low-level details remain transparent. Hadoop is written in Java, so it is highly portable across platforms, and it can be deployed on clusters of inexpensive commodity machines. Its core components are the Hadoop Distributed File System (HDFS) and MapReduce. Apache Hadoop has gone through three major generations: Hadoop 1.0, Hadoop 2.0 and Hadoop 3.0. Besides the free, open-source Apache Hadoop, several companies ship commercial Hadoop distributions. In 2008 Cloudera became the first company to commercialize Hadoop, releasing its first distribution in 2009. Many other vendors later joined in productizing Hadoop, such as MapR, Hortonworks and Transwarp (星环). In October 2018 Cloudera and Hortonworks announced their merger. Commercial distributions are generally built on top of Apache Hadoop, but offer better usability, more features and higher performance than the plain Apache release.

Single-Node Installation

Preliminary setup

Disable the firewall

# stop the firewall
systemctl stop firewalld
# disable it on boot
systemctl disable firewalld

Set the hostname and add a hosts mapping

# set the hostname
hostnamectl set-hostname hadoop

# add the mapping
vim /etc/hosts
192.168.131.144 hadoop

Create a hadoop user

# create the user with /bin/bash as its shell
useradd -m hadoop -s /bin/bash

# set a password for the hadoop user; if it is reported as weak, ignore the warning and type it once more
passwd hadoop

# give the hadoop user sudo privileges
visudo
# type :98 to jump to line 98, then add a line: hadoop ALL=(ALL) ALL
root ALL=(ALL) ALL
hadoop ALL=(ALL) ALL

Set up passwordless SSH login

# press Enter three times to accept the defaults
ssh-keygen -t rsa

# inspect the generated key pair
ls ~/.ssh/

# append the public key; when prompted, type yes and press Enter again
ssh-copy-id hadoop

# check that the authentication file authorized_keys was created
ls ~/.ssh/
# authorized_keys  id_rsa  id_rsa.pub  known_hosts

# verify passwordless login
ssh hadoop
ssh 192.168.131.144
# log out
exit

Install Java and configure environment variables

# upload the archive to the server, extract it and rename the directory
tar -xvzf jdk-8u351-linux-x64.tar.gz
mv jdk1.8.0_351 jdk8

# edit the profile
vim /etc/profile

# JDK environment variables
export JAVA_HOME=/home/hadoop/jdk8   # JDK installation directory
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib:$CLASSPATH
export JAVA_PATH=${JAVA_HOME}/bin:${JRE_HOME}/bin
export PATH=$PATH:${JAVA_PATH}

# reload the profile
source /etc/profile

# verify the installation
java -version

Install Hadoop and configure environment variables

# upload the archive to the server, extract it and rename the directory
tar -zxvf hadoop-3.1.3.tar.gz
mv hadoop-3.1.3 hadoop

# check that the binary works
/home/hadoop/hadoop/bin/hadoop version

# edit the profile
vim /etc/profile

# Hadoop environment variables
export HADOOP_HOME=/home/hadoop/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

# reload the profile
source /etc/profile

hadoop version

Standalone (Non-Distributed) Mode

This mode is mainly useful for debugging.

# go to the Hadoop installation directory
cd /home/hadoop/hadoop

# use the configuration files as input
mkdir ./input
cp ./etc/hadoop/*.xml ./input
# run the example
./bin/hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar grep ./input ./output 'dfs[a-z.]+'
# view the result
cat ./output/*

# clean up afterwards if needed
# rm -rf ./input ./output
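One caveat when re-running the example: MapReduce will not write into an output directory that already exists, so ./output has to be deleted between runs. A minimal sketch:

# the output directory must not exist before the job starts
rm -rf ./output
./bin/hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar grep ./input ./output 'dfs[a-z.]+'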

Pseudo-Distributed Mode

Create the data directories for Hadoop

# go to the Hadoop installation directory
cd /home/hadoop/hadoop/

# create the tmp directory (this matches hadoop.tmp.dir in the configuration below)
mkdir tmp
# create the dfs directory
cd tmp/
mkdir dfs
# create the name and data directories
cd dfs/
mkdir name
mkdir data

ls
# data  name

Edit the configuration files

# all Hadoop configuration files live under etc/hadoop in the installation directory
cd /home/hadoop/hadoop/etc/hadoop

vim hadoop-env.sh
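The only setting that normally has to change in hadoop-env.sh is the JDK location. A minimal sketch, assuming the JDK installed above at /home/hadoop/jdk8:

# hadoop-env.sh: point Hadoop at the JDK installed earlier
export JAVA_HOME=/home/hadoop/jdk8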
vim core-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <!-- which file system to use -->
  <property>
    <name>fs.defaultFS</name>
    <!-- use the HDFS distributed file system -->
    <!-- HDFS address: hdfs://<HDFS master node name>:9000 (default port) -->
    <!-- in pseudo-distributed mode every role runs on this machine, so the node name is the hostname -->
    <value>hdfs://hadoop:9000</value>
  </property>
  <!-- Hadoop working directory, where files produced at runtime are stored -->
  <property>
    <name>hadoop.tmp.dir</name>
    <!-- the data lives under tmp in the Hadoop installation directory -->
    <value>/home/hadoop/hadoop/tmp/</value>
    <description>A base for other temporary directories.</description>
  </property>
  <property>
    <name>hadoop.http.staticuser.user</name>
    <value>root</value>
  </property>
</configuration>
vim hdfs-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <!-- number of replicas HDFS keeps for each block, 3 by default -->
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <property>
    <!-- directory where the NameNode stores the name table -->
    <name>dfs.namenode.name.dir</name>
    <value>/home/hadoop/hadoop/tmp/dfs/name</value>
  </property>
  <property>
    <!-- directory where the DataNode stores its blocks -->
    <name>dfs.datanode.data.dir</name>
    <value>/home/hadoop/hadoop/tmp/dfs/data</value>
  </property>
  <property>
    <!-- address and port of the web UI -->
    <name>dfs.http.address</name>
    <value>0.0.0.0:50070</value>
  </property>
</configuration>
vim mapred-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <!-- tell the framework to run MapReduce on YARN -->
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
vim yarn-site.xml

<?xml version="1.0"?>
<configuration>
  <!-- YARN master node; in pseudo-distributed mode this is the local machine -->
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>hadoop</value>
  </property>
  <!-- reducers fetch data via mapreduce_shuffle -->
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
</configuration>

Update the environment variables

# edit the profile
vim /etc/profile

# add the following variables
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root

export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib"

# reload the profile
source /etc/profile

Format the file system

hadoop namenode -format

Start the pseudo-distributed cluster

start-all.sh
jps
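As a quick sanity check (assuming everything came up), jps should list the HDFS and YARN daemons in addition to Jps itself:

jps
# expected daemons for this pseudo-distributed setup:
#   NameNode, DataNode, SecondaryNameNode   (HDFS)
#   ResourceManager, NodeManager            (YARN)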

Web UIs (with the firewall disabled or the ports opened)

http://192.168.131.144:50070/
http://192.168.131.144:8088

Stop the pseudo-distributed cluster

stop-all.sh

Fully Distributed Installation (optional)

VM layout

          hadoop1 (192.168.131.145)    hadoop2 (192.168.131.146)       hadoop3 (192.168.131.147)
HDFS      NameNode, DataNode           DataNode                        SecondaryNameNode, DataNode
YARN      NodeManager                  ResourceManager, NodeManager    NodeManager

Clone three virtual machines and give each a static IP before proceeding.

Preparation

Disable the firewall (all three nodes)

# stop the firewall
systemctl stop firewalld
# disable it on boot
systemctl disable firewalld

Set the hostnames and hosts mappings (all three nodes)

# hadoop1: set the hostname
hostnamectl set-hostname hadoop1

# add the mappings
vim /etc/hosts
192.168.131.145 hadoop1
192.168.131.146 hadoop2
192.168.131.147 hadoop3

# hadoop2: set the hostname
hostnamectl set-hostname hadoop2

# add the mappings
vim /etc/hosts
192.168.131.145 hadoop1
192.168.131.146 hadoop2
192.168.131.147 hadoop3

# hadoop3: set the hostname
hostnamectl set-hostname hadoop3

# add the mappings
vim /etc/hosts
192.168.131.145 hadoop1
192.168.131.146 hadoop2
192.168.131.147 hadoop3

Set up passwordless SSH login (all three nodes)

# press Enter three times
ssh-keygen -t rsa
# the private and public keys are stored under /root/.ssh
cd /root/.ssh
# create a file named authorized_keys
touch authorized_keys
# append the public key to it
cat id_rsa.pub >> authorized_keys   # at this point the file only holds this machine's own key
# store the public keys of all three hosts in authorized_keys on every machine (crude copy & paste)

ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDkJjZnDNzV+m87Ox6RanAJW4Wl3ZXHJSumNcoXitLTew9NMWmliK7nl87iWOP7iAuqIwgv0mowaO9J8wDUjCqhv2gQymNDR+tKcX1DNIgrR8xdO2jHjmi2NsyihoMgVUbs27mFaznPiJprSEEznE9GZgoQ5C27QKxG2f03p1dWiPJHNfcf4EJO1V/2YI+u1hprKkoZFoZr7q6c5fTJkRSn6BBFXKHQOeQTTdrbeJA4zdUO/Rb/0jWTuhUFU52hfgQtzmEhDKiE2N19eG78p7QLvY4clcaYDg9m1p8I9HeXVMAp7pg+d3NadksXlzWEbyF5BTc3wHYXhC0bv6yhplI5 root@hadoop1
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs02adR7MDyNeVwarqoKlyNIfZjUHi8zgZjzbVTattM9GwvF/FC0rRzjOYjexpjpyA0IVOm5ililEwMX8yjJhhx7h004OKdl39O642dtr8Pnf/SEy8cLPilnGO0n3lXIXzpqekTHQ3czlE3X9UDfG1BcYJcOIkD4ltlAXkS7bnosqoN/Eu82Dec1AsyjCITWayAQcC0fMsTlKKWgh8sBaXIjdYwrAh9WbZjyatYnWnTBxtqDXhZhDHc9QdWaq0pYtrMGQ7ZZ2weqfKDY9+7wEwBgkg7SmJhkwtVDiuO8Xef9geFmnHzKrpTCel+J3EfwHYOV0yMWAvZ2TFo271ogk3 root@hadoop2
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDpZFNA4qlYVrPcVIULo41h6y8qr2EJW7xj4JM9K3kLqFZ1tQw2voN9VA91GlOwKI1lvCVWyuwnkQaWLXHN3M56Jdpw9pupNt3LAXjEmLl+8Ze1vfbgv7DoVS9yJyJQn8V/AKZE3xl/AftrY8anMtYAIujwEn1M5Fra4ozLTfRRmB0Duax/Qgi2xCHaq90YldKcwI4pZ2nz1y9ffwhLR1wKQUJBGyhVIshw8a/vkrnjNGcUwa7fe55SelOEl5bSOgcTGeXHaQufGmnHslwrc91Qzd+W2peaF2ChDCoob+wIqJq7liqMeZw6go/IN+VSWkF4EXwKu4RPaY+bHj9EUlT9 root@hadoop3
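Instead of pasting keys by hand, ssh-copy-id can distribute each node's public key; a sketch to run on every one of the three nodes (it assumes the hostnames above resolve and asks for each host's password once):

ssh-copy-id hadoop1
ssh-copy-id hadoop2
ssh-copy-id hadoop3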

Install the JDK and configure environment variables (all three nodes)

# upload the archive to the server, extract it and rename the directory
tar -xvzf jdk-8u333-linux-i586.tar.gz
mv jdk1.8.0_333 jdk8
rm -rf jdk-8u333-linux-i586.tar.gz

# edit the profile
vim /etc/profile

# JDK environment variables
export JAVA_HOME=/opt/jdk8   # JDK installation directory
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib:$CLASSPATH
export JAVA_PATH=${JAVA_HOME}/bin:${JRE_HOME}/bin
export PATH=$PATH:${JAVA_PATH}

# reload the profile
source /etc/profile

# verify the installation
java -version

Install Hadoop and configure environment variables (all three nodes)

# upload the archive to the server, extract it and rename the directory
tar -zxvf hadoop-3.1.3.tar.gz
mv hadoop-3.1.3 hadoop
rm -rf hadoop-3.1.3.tar.gz

# check that the binary works
/opt/hadoop/bin/hadoop version

# edit the profile
vim /etc/profile

# Hadoop environment variables
export HADOOP_HOME=/opt/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

# reload the profile
source /etc/profile

hadoop version

Update the environment variables (all three nodes)

# edit the profile
vim /etc/profile

# add the following variables
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root

export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib"

# reload the profile
source /etc/profile

Edit the configuration files (all three nodes)

# all Hadoop configuration files live under etc/hadoop in the installation directory
cd /opt/hadoop/etc/hadoop

vim hadoop-env.sh

export JAVA_HOME=/opt/jdk8
vim core-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <!-- address of the NameNode -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://hadoop1:8020</value>
  </property>
  <!-- directory for files Hadoop produces at runtime -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/opt/hadoop/data</value>
  </property>
</configuration>
vim hdfs-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <!-- NameNode web UI address -->
  <property>
    <name>dfs.namenode.http-address</name>
    <value>hadoop1:9870</value>
  </property>
  <!-- SecondaryNameNode web UI address -->
  <property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>hadoop3:9868</value>
  </property>
</configuration>
vim yarn-site.xml

<?xml version="1.0"?>
<configuration>
  <!-- let MapReduce use the shuffle service -->
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <!-- host of the YARN ResourceManager -->
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>hadoop2</value>
  </property>
</configuration>
vim mapred-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <!-- run MapReduce on YARN -->
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
vim workers

hadoop1
hadoop2
hadoop3

Format the file system (run once, on hadoop1 where the NameNode lives)

hdfs namenode -format

Start the distributed cluster

start-all.sh
jps   # check the running processes
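Since the ResourceManager is configured on hadoop2, a common pattern is to start the two layers from their respective master nodes rather than relying on start-all.sh alone; a sketch under that assumption:

# on hadoop1 (NameNode)
start-dfs.sh
# on hadoop2 (ResourceManager)
start-yarn.sh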

Web UIs (with the firewall disabled or the ports opened)

# NameNode UI (hadoop1's IP)
http://192.168.131.145:9870

# ResourceManager UI (hadoop2's IP)
http://192.168.131.146:8088

Stop the distributed cluster

stop-all.sh

File Operations in Practice

Command-line operations

# list files in HDFS
hadoop fs -ls <directory path>
hadoop fs -ls /

# create a directory in HDFS
hadoop fs -mkdir <directory name>
hadoop fs -mkdir /testmkdir

# delete a file or directory in HDFS
hadoop fs -rm -r <directory or file name>
hadoop fs -rm -r /testmkdir

# upload a file to HDFS (local file first, HDFS path second)
hadoop fs -put ~/file /
hadoop fs -put t.c /

# view a file stored in HDFS
hadoop fs -cat <file path>
hadoop fs -cat /t.c

# copy a file from HDFS to the local file system (HDFS file first, local destination second)
hadoop fs -get <HDFS file> <local destination>
hadoop fs -get /t.c /

# rename a file or directory in HDFS
hadoop fs -mv /<old name> /<new name>
hadoop fs -mv /t.c /t1.c
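A few additional standard commands that may help when checking the results of the operations above:

# per-path usage, human readable
hadoop fs -du -h /
# overall capacity and usage of the file system
hadoop fs -df -h /
# built-in help for any sub-command
hadoop fs -help put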

HDFS API Programming

Code: https://github.com/pepsi-wyl/HDFS_API
Bug fix: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset

<dependencies>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>3.3.1</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>3.3.1</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.3.1</version>
  </dependency>
</dependencies>
public class HDFSUtils {

    /**
     * Build the Configuration
     */
    private static Configuration getConfiguration() {
        Configuration configuration = new Configuration();
        // use the HDFS file system at this address
        configuration.set("fs.defaultFS", "hdfs://192.168.131.144:9000");
        configuration.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        return configuration;
    }

    /**
     * Obtain a FileSystem client
     */
    public static FileSystem getFileSystem() throws IOException {
        // identify the client as the root user
        System.setProperty("HADOOP_USER_NAME", "root");
        // obtain the file system client object through FileSystem's static factory method
        return FileSystem.get(getConfiguration());
    }

    /**
     * Close the FileSystem
     */
    public static void closeFileSystem(FileSystem fileSystem) throws IOException {
        fileSystem.close();
    }

}
// list files in HDFS
public class ListFiles {
    public static void main(String[] args) throws IOException {
        FileSystem fileSystem = HDFSUtils.getFileSystem();
        FileStatus[] statuses = fileSystem.listStatus(new Path("/"));
        for (FileStatus fileStatus : statuses) {
            // each FileStatus holds the metadata of one entry
            System.out.println(fileStatus.getPath());
        }
        HDFSUtils.closeFileSystem(fileSystem);
    }
}
// create a directory
public class Mkdir {
    public static void main(String[] args) throws IOException {
        FileSystem fileSystem = HDFSUtils.getFileSystem();
        boolean isOK = fileSystem.mkdirs(new Path("/javaAPI/"));
        if (isOK)
            System.out.println("directory created");
        else
            System.out.println("failed to create directory");
        HDFSUtils.closeFileSystem(fileSystem);
    }
}
// delete a directory
public class Rmdir {
    public static void main(String[] args) throws IOException {
        FileSystem fileSystem = HDFSUtils.getFileSystem();
        // true deletes recursively, false only deletes a single empty directory
        boolean isOK = fileSystem.delete(new Path("/javaAPI/"), false);
        if (isOK)
            System.out.println("directory deleted");
        else
            System.out.println("failed to delete directory");
        HDFSUtils.closeFileSystem(fileSystem);
    }
}
// upload a file
public class Upload {
    public static void main(String[] args) throws IOException {
        FileSystem fileSystem = HDFSUtils.getFileSystem();
        Path localFile = new Path("G:\\Coding\\java\\HDFS_API\\src\\main\\java\\Utils\\HDFSUtils.java");
        Path remoteFile = new Path("/");
        fileSystem.copyFromLocalFile(localFile, remoteFile);
        System.out.println("file uploaded");
        HDFSUtils.closeFileSystem(fileSystem);
    }
}
// download a file
public class Download {
    public static void main(String[] args) throws IOException {
        FileSystem fileSystem = HDFSUtils.getFileSystem();
        Path remoteFile = new Path("/HDFSUtils.java");
        Path localFile = new Path("G:\\Coding\\java\\HDFS_API\\src");
        fileSystem.copyToLocalFile(remoteFile, localFile);
        System.out.println("file downloaded");
        HDFSUtils.closeFileSystem(fileSystem);
    }
}
// rename a file or directory in HDFS
public class RenameFile {
    public static void main(String[] args) throws IOException {
        FileSystem fileSystem = HDFSUtils.getFileSystem();
        Path oldPath = new Path("/HDFSUtils.java");
        Path newPath = new Path("/HDFSUtils_rename.java");
        boolean rename = fileSystem.rename(oldPath, newPath);
        if (rename)
            System.out.println("renamed successfully");
        else
            System.out.println("rename failed");
        HDFSUtils.closeFileSystem(fileSystem);
    }
}

// print a file stored in HDFS
public class CatFile {
    public static void main(String[] args) throws IOException {
        FileSystem fileSystem = HDFSUtils.getFileSystem();
        FSDataInputStream in = fileSystem.open(new Path("/HDFSUtils.java"));
        IOUtils.copyBytes(in, System.out, 1024);
        HDFSUtils.closeFileSystem(fileSystem);
    }
}

Basic MapReduce Programming

Compiling and Packaging the Word-Count Program from the Command Line

Create the input files

wordfile1.txt
I love Spark
I love Hadoop

wordfile2.txt
Hadoop is good
Spark is fast

Upload them to HDFS
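The upload itself might look like the sketch below; it assumes the two files sit in the Hadoop installation directory you are working from, that HDFS is running, and that the job later reads the relative path input (i.e. /user/<current user>/input):

# create the input directory in HDFS and upload the two files
./bin/hdfs dfs -mkdir -p input
./bin/hdfs dfs -put wordfile1.txt wordfile2.txt input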

Write the source file

# write the source file in the Hadoop installation directory
cd /home/hadoop/hadoop

vim WordCount.java
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
* @author by pepsi-wyl
* @date 2022-10-18 16:44
*/

public class WordCount {
public WordCount() {
}

public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(WordCount.TokenizerMapper.class);
job.setCombinerClass(WordCount.IntSumReducer.class);
job.setReducerClass(WordCount.IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}

public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
private static final IntWritable one = new IntWritable(1);
private Text word = new Text();

public TokenizerMapper() {
}

public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
this.word.set(itr.nextToken());
context.write(this.word, one);
}
}
}

public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();

public IntSumReducer() {
}

public void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int sum = 0;
IntWritable val;
for (Iterator i$ = values.iterator(); i$.hasNext(); sum += val.get()) {
val = (IntWritable) i$.next();
}
this.result.set(sum);
context.write(key, this.result);
}
}
}

Compile, package and run

# work from the Hadoop installation directory
cd /home/hadoop/hadoop

# let javac find the Hadoop JARs
export CLASSPATH="/home/hadoop/hadoop/share/hadoop/common/hadoop-common-3.3.0.jar:/home/hadoop/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-core-3.3.0.jar:/home/hadoop/hadoop/share/hadoop/common/lib/commons-cli-1.2.jar:$CLASSPATH"

# compile the program
javac WordCount.java

# package the three generated .class files into WordCount.jar
jar -cvf WordCount.jar *.class

# run the jar
./bin/hadoop jar WordCount.jar WordCount input output

# view the result
./bin/hadoop fs -cat output/*

Compile

Package

Run

View the result

Troubleshooting

Hadoop: Could not find or load main class org.apache.hadoop.mapreduce.v2.app.MRAppMaster
https://blog.csdn.net/lianghecai52171314/article/details/103231176
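For reference, the fix usually suggested for this error is telling YARN where the MapReduce framework lives and then restarting YARN; a sketch of that change (the property names are the standard Hadoop 3 ones and the path is this guide's HADOOP_HOME — verify against the linked article and your own layout):

# add inside <configuration> of etc/hadoop/mapred-site.xml:
#   <property><name>yarn.app.mapreduce.am.env</name><value>HADOOP_MAPRED_HOME=/home/hadoop/hadoop</value></property>
#   <property><name>mapreduce.map.env</name><value>HADOOP_MAPRED_HOME=/home/hadoop/hadoop</value></property>
#   <property><name>mapreduce.reduce.env</name><value>HADOOP_MAPRED_HOME=/home/hadoop/hadoop</value></property>
stop-yarn.sh && start-yarn.sh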

Compiling and Running the Word-Count Program with IDEA

Create the input files

wordfile1.txt
I love Spark
I love Hadoop

wordfile2.txt
Hadoop is good
Spark is fast

Upload them to HDFS (as in the previous section).

Maven project

Create the project

Add the dependencies

<dependencies>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>3.3.1</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>3.3.1</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.3.1</version>
  </dependency>
</dependencies>

Write the source code

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

/**
* @author by pepsi-wyl
* @date 2022-10-18 16:44
*/

public class WordCount {
public WordCount() {
}

public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}

public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
private static final IntWritable one = new IntWritable(1);
private Text word = new Text();

public TokenizerMapper() {
}

public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
this.word.set(itr.nextToken());
context.write(this.word, one);
}
}
}

public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();

public IntSumReducer() {
}

public void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int sum = 0;
IntWritable val;
for (Iterator i$ = values.iterator(); i$.hasNext(); sum += val.get()) {
val = (IntWritable) i$.next();
}
this.result.set(sum);
context.write(key, this.result);
}
}
}

Package

Upload the jar to the server and run it

# run the jar
./bin/hadoop jar WordCount-1.0-SNAPSHOT.jar WordCount input output

# view the result
./bin/hadoop fs -cat output/*

Run

View the result

HBase Basics

Installation

Download and install

Version compatibility

:::tips
HBase 2.2.2 is used here (mind the Hadoop/HBase compatibility matrix)
https://hbase.apache.org/book.html#hadoop
:::

Download and extract

# download
wget https://archive.apache.org/dist/hbase/2.2.2/hbase-2.2.2-bin.tar.gz

# extract
tar -zxvf hbase-2.2.2-bin.tar.gz

# remove the archive
rm -rf hbase-2.2.2-bin.tar.gz

# rename
mv hbase-2.2.2 hbase

Configure environment variables

# edit the profile
vim /etc/profile

export HBASE_HOME=/home/hadoop/hbase
export PATH=$PATH:$HBASE_HOME/bin:$HBASE_HOME/sbin

# reload the profile
source /etc/profile

Check the HBase version

# show version information
hbase version

Configuration

Configure hbase-env.sh

vim /home/hadoop/hbase/conf/hbase-env.sh

export JAVA_HOME=/home/hadoop/jdk8
export HBASE_CLASSPATH=/home/hadoop/hbase/conf
export HBASE_MANAGES_ZK=true

Configure hbase-site.xml

vim /home/hadoop/hbase/conf/hbase-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
  <property>
    <name>hbase.rootdir</name>
    <!-- use your own VM's IP -->
    <value>hdfs://192.168.131.144:9000/hbase</value>
  </property>
  <property>
    <name>hbase.cluster.distributed</name>
    <value>true</value>
  </property>
  <property>
    <name>hbase.unsafe.stream.capability.enforce</name>
    <value>false</value>
  </property>
</configuration>

Start and stop

# start Hadoop first

# start HBase
/home/hadoop/hbase/bin/start-hbase.sh

# check the running processes
jps

# stop HBase
/home/hadoop/hbase/bin/hbase-daemon.sh stop master
/home/hadoop/hbase/bin/stop-hbase.sh

# check the running processes
jps

Shell Commands

Start the shell

# start the HBase shell
/home/hadoop/hbase/bin/hbase shell

Create a table

# create a table
create 'student','Sname','Ssex','Sage','Sdept','course'

# describe the table
describe 'student'

# list tables
list

Drop a table

# disable the table first
disable 'student'

# then drop it
drop 'student'

# list tables
list

Insert data

# insert data
put 'student','95001','Sname','LiYing'
put 'student','95001','Ssex','male'
put 'student','95001','Sage','22'
put 'student','95001','Sdept','CS'
put 'student','95001','course:math','80'

Read data

# read a single row (or cell) of the table
get 'student','95001'

# scan all the data in the table
scan 'student'

Delete data

# delete one column of a row
delete 'student','95001','Ssex'

# delete an entire row
deleteall 'student','95001'

Query historical versions

# create a teacher table that keeps 5 versions per cell
create 'teacher',{NAME=>'username',VERSIONS=>5}

# insert and then repeatedly update the data so that historical versions are produced
put 'teacher','91001','username','Mary'
put 'teacher','91001','username','Mary1'
put 'teacher','91001','username','Mary2'
put 'teacher','91001','username','Mary3'
put 'teacher','91001','username','Mary4'
put 'teacher','91001','username','Mary5'

# query a given number of historical versions
get 'teacher','91001',{COLUMN=>'username',VERSIONS=>5}
get 'teacher','91001',{COLUMN=>'username',VERSIONS=>3}

Exit

exit

Programming with IDEA

Code: https://github.com/pepsi-wyl/Hbase_API

Add a hosts mapping on Windows

# C:\Windows\System32\drivers\etc\hosts

# add the mapping
192.168.131.144 hadoop

Create a Maven project and add the dependencies

<dependencies>
  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-server</artifactId>
    <version>2.2.2</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>2.2.2</version>
  </dependency>
</dependencies>

The dependency versions must match the installed HBase version.

Write the code

package utils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

import java.io.IOException;

/**
* @author by pepsi-wyl
* @date 2022-10-29 11:19
*/

public class ExampleForHBase {
public static Configuration configuration;
public static Connection connection;
public static Admin admin;

public static void main(String[] args) throws IOException {
init();
createTable("student", new String[]{"score"});
insertData("student", "zhangsan", "score", "English", "69");
insertData("student", "zhangsan", "score", "Math", "86");
insertData("student", "zhangsan", "score", "Computer", "77");
getData("student", "zhangsan", "score", "English");
close();
}

public static void init() {
configuration = HBaseConfiguration.create();
configuration.set("hbase.rootdir", "hdfs://192.168.131.144:9000/hbase");
configuration.set("hbase.zookeeper.quorum", "192.168.131.144");
configuration.set("hbase.zookeeper.property.clientPort", "2181");
try {
connection = ConnectionFactory.createConnection(configuration);
admin = connection.getAdmin();
} catch (IOException e) {
e.printStackTrace();
}
}

public static void close() {
try {
if (admin != null) {
admin.close();
}
if (null != connection) {
connection.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}

public static void createTable(String myTableName, String[] colFamily) throws IOException {
TableName tableName = TableName.valueOf(myTableName);
if (admin.tableExists(tableName)) {
System.out.println("table already exists!");
} else {
TableDescriptorBuilder tableDescriptor = TableDescriptorBuilder.newBuilder(tableName);
for (String str : colFamily) {
ColumnFamilyDescriptor family = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes(str)).build();
tableDescriptor.setColumnFamily(family);
}
admin.createTable(tableDescriptor.build());
}
}

public static void insertData(String tableName, String rowKey, String colFamily, String col, String val) throws IOException {
Table table = connection.getTable(TableName.valueOf(tableName));
Put put = new Put(rowKey.getBytes());
put.addColumn(colFamily.getBytes(), col.getBytes(), val.getBytes());
table.put(put);
table.close();
}

public static void getData(String tableName, String rowKey, String colFamily, String col) throws IOException {
Table table = connection.getTable(TableName.valueOf(tableName));
Get get = new Get(rowKey.getBytes());
get.addColumn(colFamily.getBytes(), col.getBytes());
Result result = table.get(get);
System.out.println(new String(result.getValue(colFamily.getBytes(), col == null ? null : col.getBytes())));
table.close();
}
}

ZooKeeper

Installation

Standalone installation

In this mode there are no ZooKeeper replicas, so if the ZooKeeper server fails the whole service stops. It is mainly meant for testing and demos and is generally not used in production.

# change directory
cd /home/hadoop

# fix the "certificate has expired" problem
yum install -y ca-certificates

# download the archive
wget https://dlcdn.apache.org/zookeeper/zookeeper-3.6.3/apache-zookeeper-3.6.3-bin.tar.gz

# extract it
tar -zxvf apache-zookeeper-3.6.3-bin.tar.gz

# rename
mv apache-zookeeper-3.6.3-bin zookeeper

# fix ownership
sudo chown hadoop:hadoop -R /home/hadoop/zookeeper

# remove the archive
rm -rf apache-zookeeper-3.6.3-bin.tar.gz
# create a directory for this instance
mkdir /home/hadoop/zookeeper/one

# create the data directory
mkdir /home/hadoop/zookeeper/one/data

# copy the sample configuration
cp /home/hadoop/zookeeper/conf/zoo_sample.cfg /home/hadoop/zookeeper/conf/zoo.cfg
# edit the configuration file
vim /home/hadoop/zookeeper/conf/zoo.cfg

tickTime=2000
dataDir=/home/hadoop/zookeeper/one/data
clientPort=2181
# start the ZooKeeper server
/home/hadoop/zookeeper/bin/zkServer.sh start

# stop the ZooKeeper server
/home/hadoop/zookeeper/bin/zkServer.sh stop

# check the server status
/home/hadoop/zookeeper/bin/zkServer.sh status

# check the processes; a "QuorumPeerMain" process means ZooKeeper is running
jps

Pseudo-cluster installation

Pseudo-cluster mode simulates a ZooKeeper cluster on a single machine. In the ZooKeeper configuration, clientPort is the port clients connect to. In pseudo-cluster mode each configuration file stands for one server, which means several ZooKeeper instances run on the same host, so the clientPort in each configuration file must not clash.

# create the cluster directories
mkdir /home/hadoop/zookeeper/master

mkdir /home/hadoop/zookeeper/master/conf
mkdir /home/hadoop/zookeeper/master/data
mkdir /home/hadoop/zookeeper/master/data/one
mkdir /home/hadoop/zookeeper/master/data/two
mkdir /home/hadoop/zookeeper/master/data/three
mkdir /home/hadoop/zookeeper/master/datalog
mkdir /home/hadoop/zookeeper/master/datalog/one
mkdir /home/hadoop/zookeeper/master/datalog/two
mkdir /home/hadoop/zookeeper/master/datalog/three

# copy the sample configuration three times
cp /home/hadoop/zookeeper/conf/zoo_sample.cfg /home/hadoop/zookeeper/master/conf/zoo1.cfg
cp /home/hadoop/zookeeper/conf/zoo_sample.cfg /home/hadoop/zookeeper/master/conf/zoo2.cfg
cp /home/hadoop/zookeeper/conf/zoo_sample.cfg /home/hadoop/zookeeper/master/conf/zoo3.cfg
vim /home/hadoop/zookeeper/master/conf/zoo1.cfg
tickTime=2000
initLimit=10
syncLimit=5
dataDir=/home/hadoop/zookeeper/master/data/one
dataLogDir=/home/hadoop/zookeeper/master/datalog/one
clientPort=2181
server.1=127.0.0.1:2888:3888
server.2=127.0.0.1:2889:3889
server.3=127.0.0.1:2890:3890
maxClientCnxns=60


vim /home/hadoop/zookeeper/master/conf/zoo2.cfg
tickTime=2000
initLimit=10
syncLimit=5
dataDir=/home/hadoop/zookeeper/master/data/two
dataLogDir=/home/hadoop/zookeeper/master/datalog/two
clientPort=2182
server.1=127.0.0.1:2888:3888
server.2=127.0.0.1:2889:3889
server.3=127.0.0.1:2890:3890
maxClientCnxns=60

vim /home/hadoop/zookeeper/master/conf/zoo3.cfg
tickTime=2000
initLimit=10
syncLimit=5
dataDir=/home/hadoop/zookeeper/master/data/three
dataLogDir=/home/hadoop/zookeeper/master/datalog/three
clientPort=2183
server.1=127.0.0.1:2888:3888
server.2=127.0.0.1:2889:3889
server.3=127.0.0.1:2890:3890
maxClientCnxns=60
# if the myid file is missing: https://blog.csdn.net/a_bang/article/details/72825929

vim /home/hadoop/zookeeper/master/data/one/myid
1
vim /home/hadoop/zookeeper/master/data/two/myid
2
vim /home/hadoop/zookeeper/master/data/three/myid
3
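The same three files can also be written without opening an editor; a minimal sketch:

# each myid holds nothing but that instance's server id
echo 1 > /home/hadoop/zookeeper/master/data/one/myid
echo 2 > /home/hadoop/zookeeper/master/data/two/myid
echo 3 > /home/hadoop/zookeeper/master/data/three/myid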
# start
/home/hadoop/zookeeper/bin/zkServer.sh start /home/hadoop/zookeeper/master/conf/zoo1.cfg
/home/hadoop/zookeeper/bin/zkServer.sh start /home/hadoop/zookeeper/master/conf/zoo2.cfg
/home/hadoop/zookeeper/bin/zkServer.sh start /home/hadoop/zookeeper/master/conf/zoo3.cfg

# check status
/home/hadoop/zookeeper/bin/zkServer.sh status /home/hadoop/zookeeper/master/conf/zoo1.cfg
/home/hadoop/zookeeper/bin/zkServer.sh status /home/hadoop/zookeeper/master/conf/zoo2.cfg
/home/hadoop/zookeeper/bin/zkServer.sh status /home/hadoop/zookeeper/master/conf/zoo3.cfg

# stop
/home/hadoop/zookeeper/bin/zkServer.sh stop /home/hadoop/zookeeper/master/conf/zoo1.cfg
/home/hadoop/zookeeper/bin/zkServer.sh stop /home/hadoop/zookeeper/master/conf/zoo2.cfg
/home/hadoop/zookeeper/bin/zkServer.sh stop /home/hadoop/zookeeper/master/conf/zoo3.cfg
# run one instance in the foreground (useful for debugging)
/home/hadoop/zookeeper/bin/zkServer.sh start-foreground /home/hadoop/zookeeper/master/conf/zoo1.cfg

Distributed installation (optional)

This mode gives you a reliable ZooKeeper service: as long as a majority of the servers in the ensemble are running, the service as a whole stays available. The main difference from the pseudo-cluster setup is that the ZooKeeper instances are spread across several machines.

# change directory
cd /opt

# fix the "certificate has expired" problem
yum install -y ca-certificates

# download the archive
wget https://dlcdn.apache.org/zookeeper/zookeeper-3.6.3/apache-zookeeper-3.6.3-bin.tar.gz

# extract it
tar -zxvf apache-zookeeper-3.6.3-bin.tar.gz

# rename
mv apache-zookeeper-3.6.3-bin zookeeper

# remove the archive
rm -rf apache-zookeeper-3.6.3-bin.tar.gz

# create the data directory
mkdir /opt/zookeeper/data

# create the transaction-log directory
mkdir /opt/zookeeper/datalog

# copy the sample configuration
cp /opt/zookeeper/conf/zoo_sample.cfg /opt/zookeeper/conf/zoo.cfg

# edit the configuration
vim /opt/zookeeper/conf/zoo.cfg

tickTime=2000
initLimit=10
syncLimit=5
dataDir=/opt/zookeeper/data
dataLogDir=/opt/zookeeper/datalog
clientPort=2182
server.1=192.168.131.145:2887:3887
server.2=192.168.131.146:2888:3888
server.3=192.168.131.147:2899:3899

# on hadoop1
vim /opt/zookeeper/data/myid
1

# on hadoop2
vim /opt/zookeeper/data/myid
2

# on hadoop3
vim /opt/zookeeper/data/myid
3

# start the ZooKeeper server
/opt/zookeeper/bin/zkServer.sh start

# stop the ZooKeeper server
/opt/zookeeper/bin/zkServer.sh stop

# check the server status
/opt/zookeeper/bin/zkServer.sh status

Basic Operations

0. Start the CLI

# start the CLI
/home/hadoop/zookeeper/bin/zkCli.sh
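With several instances of the pseudo-cluster running, zkCli.sh can be pointed at a specific one via -server; a sketch using one of the ports configured above:

# connect to the instance listening on 2182
/home/hadoop/zookeeper/bin/zkCli.sh -server 127.0.0.1:2182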

1. Create znodes

# create a znode
create /path /data
create /FirstZnode "Myfirstzookeeper-pp"

# create a sequential znode (flag -s)
create -s /path /data
create -s /FirstZnode "second-data"

# create an ephemeral znode (flag -e)
create -e /path /data
create -e /SecondZnode "Ephemeral-data"

2. Get data

# get data
get /path

get /FirstZnode
get /FirstZnode0000000001

3. Watch

# watch a znode
get /path [watch] 1

get /FirstZnode 1

4. Set data

# set / update data
set /path /data

set /SecondZnode Data-updated
set /SecondZnode abc

5. Create a child znode

# create a child znode
create /parent/path/subnode/path /data

create /FirstZnode/Child1 firstchildren

6. List the children of a znode

# list the children of a znode
ls /path

ls /FirstZnode

7. Check status

# check status
stat /path

stat /FirstZnode

8. Delete a znode

# delete a znode
delete /path

delete /SecondZnode

Hive Data Warehouse

Installation

Install MySQL

MySQL is installed with Docker here; for a Docker walkthrough see https://www.yuque.com/pepsiwyl/blog/ghlc1t
For a native MySQL installation see https://www.yuque.com/pepsiwyl/blog/zalhzm

Install Docker

# install Docker
curl -fsSL https://get.docker.com | bash -s docker --mirror Aliyun

# add the current user to the docker group
sudo groupadd docker            # create the docker group
sudo usermod -aG docker $USER   # add the current user to it

# start Docker
systemctl start docker    # start the service
systemctl enable docker   # start it on boot

# check the Docker version
docker version


# configure the Aliyun registry mirror by running the following script
sudo mkdir -p /etc/docker
sudo tee /etc/docker/daemon.json <<-'EOF'
{
"registry-mirrors": ["https://fimlvmx5.mirror.aliyuncs.com"]
}
EOF
sudo systemctl daemon-reload
sudo systemctl restart docker

Start the MySQL container

# run MySQL with user root / password root
docker run -p 3306:3306 -e MYSQL_ROOT_PASSWORD=root -v mysql:/var/lib/mysql -d --restart=always --name mysql8.0 mysql:8.0.24

# check that the container started
docker ps

Create the hive database
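The metastore database referenced by hive-site.xml below has to exist in MySQL before schema initialization. A minimal sketch using the container started above (the container name mysql8.0 and password root come from that step; the utf8mb4 character set is just a reasonable default):

# create the hive metastore database inside the MySQL container
docker exec -it mysql8.0 mysql -uroot -proot -e "CREATE DATABASE IF NOT EXISTS hive DEFAULT CHARACTER SET utf8mb4;"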

Install Hive

Installation

# change directory
cd /opt

# download the archive (see https://dlcdn.apache.org/hive/)
wget https://dlcdn.apache.org/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz

# extract
tar -zxvf apache-hive-3.1.2-bin.tar.gz

# rename
mv apache-hive-3.1.2-bin hive

# remove the archive
rm -rf apache-hive-3.1.2-bin.tar.gz

# add environment variables
vim /etc/profile
# Hive installation directory
export HIVE_HOME=/opt/hive
export PATH=$PATH:$HIVE_HOME/bin
# reload the profile
source /etc/profile

# check that Hive is installed
hive --version

Add the MySQL JDBC driver

# change directory
cd /opt

# download the rpm package
wget https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-8.0.24-1.el7.noarch.rpm

# unpack the rpm
rpm2cpio mysql-connector-java-8.0.24-1.el7.noarch.rpm | cpio -div

# remove the rpm
rm -rf mysql-connector-java-8.0.24-1.el7.noarch.rpm

# copy the jar into Hive's lib directory
cp /opt/usr/share/java/mysql-connector-java.jar /opt/hive/lib

# remove the unpacked files
rm -rf /opt/usr

Configure Hive

# change directory
cd /opt/hive/conf

# rename hive-default.xml.template to hive-default.xml
mv hive-default.xml.template hive-default.xml

# create a new hive-site.xml
vim hive-site.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <!-- IP address of the MySQL server -->
    <value>jdbc:mysql://192.168.131.144:3306/hive?useUnicode=true&amp;serverTimezone=UTC&amp;useUnicode=true&amp;characterEncoding=utf8&amp;useSSL=false</value>
    <description>JDBC connect string for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.cj.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
    <description>username to use against metastore database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>root</value>
    <description>password to use against metastore database</description>
  </property>
</configuration>

Initialize the metastore schema

schematool -dbType mysql -initSchema

If it errors out, see https://www.cnblogs.com/syq816/p/12632028.html for a fix.

Start Hive

# start Hadoop first (start-all.sh), then start Hive
hive

Basic Operations

Create databases, tables and views

# create the database hive
create database if not exists hive;

# create tables
use hive;
create table if not exists usr(id bigint,name string,age int);
create table if not exists hive.usr(id bigint,name string,age int) location '/usr/local/hive/warehouse/hive/usr';
create table if not exists usr1 like usr;

# create a view
create view little_usr as select id,age from usr;

Drop databases, tables and views

# drop a database
drop database if exists hive;
drop database if exists hive cascade;

# drop a table
drop table if exists usr;

# drop a view
drop view if exists little_usr;

Alter databases, tables and views

# alter a database
alter database hive set dbproperties('edited-by'='lily');

# alter a table
alter table usr rename to user;
alter table usr add if not exists partition(age=10);
alter table usr drop if exists partition(age=10);
alter table usr change name username string after age;
alter table usr add columns(sex boolean);
alter table usr replace columns(newid bigint,newname string,newage int);
alter table usr set tblproperties('notes'='the columns in usr may be null except id');

# alter a view
alter view little_usr set tblproperties('create_at'='refer to timestamp');

Show databases, tables and views

# show databases
show databases;

# show tables and views
use hive;
show tables;

Describe databases, tables and views

# describe a database
describe database hive;
describe database extended hive;

# describe tables and views
describe hive.usr;
describe hive.little_usr;
describe extended hive.usr;
describe extended hive.little_usr;

Load data into a table

# load the data files under '/usr/local/data' into table usr, overwriting the existing data
load data local inpath '/usr/local/data' overwrite into table usr;

# load the data files under '/usr/local/data' into table usr without overwriting
load data local inpath '/usr/local/data' into table usr;

Insert data into a table / export data from a table

# insert rows from usr into usr1, overwriting the existing data
insert overwrite table usr1 select * from usr where age=10;

# insert rows from usr into usr1, appending to the existing data
insert into table usr1 select * from usr where age=10;

Worked example: word count

# create the HDFS input directory
hadoop fs -mkdir /user/root/input

cd /opt
echo "hello world" > file1.txt
echo "hello hadoop" > file2.txt

# upload
hadoop fs -put file1.txt /user/root/input
hadoop fs -put file2.txt /user/root/input

# remove the local files
rm -rf file1.txt file2.txt
# start Hive
hive

# create the input table
create table if not exists docs(line string);

# drop the output table if it exists
drop table if exists word_count;

# load the data
load data inpath 'input' overwrite into table docs;

# check the loaded data
select * from docs;

# run the analysis
create table word_count as
select word, count(1) as count from
(select explode(split(line,' '))as word from docs) w
group by word
order by word;

# query the result
select * from word_count;

Pig

Install Pig

# change directory
cd /opt

# download the archive (see https://dlcdn.apache.org/pig/)
wget https://dlcdn.apache.org/pig/pig-0.17.0/pig-0.17.0.tar.gz

# extract
tar -zxvf pig-0.17.0.tar.gz

# rename
mv pig-0.17.0 pig

# remove the archive
rm -rf pig-0.17.0.tar.gz

# add environment variables
vim /etc/profile
# Pig installation directory
export PIG_HOME=/opt/pig
export PATH=$PATH:$PIG_HOME/bin
# reload the profile
source /etc/profile

# check the Pig version
pig -version

# check that Pig works: enter the Pig (grunt) shell with the pig command and run "sh ls";
# if the command runs and prints a directory listing, the installation is fine

# local mode
pig -x local

# MapReduce mode
pig -x mapreduce

Exercise

Create the data files and upload them to HDFS

# change directory
cd /opt

# create the file
vim student.txt
201000101:李勇:男:20:计算机软件及理论
201000102:王丽:女:19:计算机软件及理论
201000103:刘花:女:18:计算机应用技术
201000104:李肖:男:19:计算机系统结构
201000105:浩达:男:19:计算机系统结构
201000106:华克:男:19:计算机系统结构

# create the file
vim course.txt
01,English,4
02,Data Structure,2
03,DataBase,2
04,DB Design,3
05,C Language,3
06,Principles of Network,3
07,OS,3

# create the file
vim sc.txt
201000101,01,92
201000101,03,84
201000102,01,90
201000102,02,94
201000102,03,82
201000103,01,72
201000103,02,90
201000104,03,75

# create a dedicated HDFS directory for these three files
hdfs dfs -mkdir /pig_test

# upload the files
hdfs dfs -put student.txt /pig_test
hdfs dfs -put course.txt /pig_test
hdfs dfs -put sc.txt /pig_test

# remove the local files
rm -rf student.txt
rm -rf course.txt
rm -rf sc.txt

Run the queries

# enter the Pig (grunt) shell
pig

# load the data in Pig
a = load '/pig_test/student.txt' using PigStorage(':') as (sno:chararray,sname:chararray,sex:chararray,age:int,dept:chararray);
b = load '/pig_test/course.txt' using PigStorage(',') as (cno:chararray,cname:chararray,grade:chararray);
c = load '/pig_test/sc.txt' using PigStorage(',') as (sno:chararray,cno:chararray,score:float);

a_join_c = join a by sno,c by sno;
dump a_join_c;

abc_join = join b by cno,a_join_c by c::cno;
dump abc_join;

not_excl = filter abc_join by score<80;
dump not_excl;

foreach_data = foreach not_excl generate sname,cname,score;
dump foreach_data;
Running Pig on the Hadoop cluster may report: INFO org.apache.hadoop.ipc.Client - Retrying connect to server: 0.0.0.0/0.0.0.0:10020. Already tried 0 time(s); retry policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1000 MILLISECONDS)

Fix: start the MapReduce history server:
mapred --daemon start historyserver

Sqoop

Install Sqoop

# change directory
cd /opt

# download the archive (see https://archive.apache.org/dist/sqoop/)
wget https://archive.apache.org/dist/sqoop/1.4.7/sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz

# extract
tar -zxvf sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz

# rename
mv sqoop-1.4.7.bin__hadoop-2.6.0 sqoop

# configure environment variables
vim /etc/profile
# Sqoop installation directory
export SQOOP_HOME=/opt/sqoop
export PATH=$PATH:$SQOOP_HOME/bin
export CLASSPATH=$CLASSPATH:$SQOOP_HOME/lib
# reload the profile
source /etc/profile

# remove the archive
rm -rf sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz

# rename the configuration template
mv /opt/sqoop/conf/sqoop-env-template.sh /opt/sqoop/conf/sqoop-env.sh

# edit sqoop-env.sh and add:
vim sqoop-env.sh
export HADOOP_COMMON_HOME=/home/hadoop/hadoop
export HADOOP_MAPRED_HOME=/home/hadoop/hadoop
export HIVE_HOME=/opt/hive
export HIVE_CONF_DIR=/opt/hive/conf

# copy the MySQL driver
cp /opt/hive/lib/mysql-connector-java.jar /opt/sqoop/lib

# test the database connection
sqoop list-databases --connect jdbc:mysql://127.0.0.1:3306/ --username root -P
-> enter the password ......

# fix for: Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/commons/lang/StringUtils
cd /opt
wget https://mirrors.tuna.tsinghua.edu.cn/apache//commons/lang/binaries/commons-lang-2.6-bin.tar.gz
tar -zxvf commons-lang-2.6-bin.tar.gz
cp /opt/commons-lang-2.6/commons-lang-2.6.jar /opt/sqoop/lib
rm -rf commons-lang-2.6-bin.tar.gz
rm -rf commons-lang-2.6

Exercise

Create the source data in MySQL

drop database if exists userdb;
create database userdb;
use userdb;
drop table if exists emp;
drop table if exists emp_add;
drop table if exists emp_conn;

CREATE TABLE emp(
id INT NOT NULL,
name VARCHAR(100),
deg VARCHAR(100),
salary BIGINT,
dept VARCHAR(50)
);

CREATE TABLE emp_add(
id INT NOT NULL,
hno VARCHAR(50),
street VARCHAR(50),
city VARCHAR(50)
);

CREATE TABLE emp_conn(
id INT NOT NULL,
phno VARCHAR(50),
email VARCHAR(50)
);

insert into emp values(1201,'gopal','manager','50000','TP');
insert into emp values(1202,'manisha','Proof reader','50000','TP');
insert into emp values(1204,'prasanth','php dev','30000','AC');
insert into emp values(1205,'kranthi','admin','20000','TP');

insert into emp_add values(1201,'288A','vgiri','jublee');
insert into emp_add values(1202,'108I','aoc','sec-bad');
insert into emp_add values(1203,'144Z','pgutta','hyd');
insert into emp_add values(1204,'78B','old city','sec-bad');
insert into emp_add values(1205,'720X','hitec','sec-bad');

insert into emp_conn values(1201,'2356742','gopal@tp.com');
insert into emp_conn values(1202,'1661663','manisha@tp.com');
insert into emp_conn values(1203,'8887776','khalil@ac.com');
insert into emp_conn values(1204,'9988774','prasanth@ac.com');
insert into emp_conn values(1205,'1231231','kranthi@tp.com');

Importing Data with Sqoop

Import into HDFS

# import
hadoop fs -rm -r /user/root/emp;
sqoop import --connect jdbc:mysql://192.168.131.144:3306/userdb --username root --password root --table emp --m 1;

# view the result
hadoop fs -cat /user/root/emp/part-m-00000

Import into Hive

# when importing into Hive, the default target directory must not already contain the table
hadoop fs -rm -r /user/root/emp;

# import into Hive
sqoop import --connect jdbc:mysql://192.168.131.144:3306/userdb --username root --password root --table emp --hive-import --m 1;

# fix for: ERROR tool.ImportTool: Import failed: java.io.IOException: java.lang.ClassNotFoundException:
cp /opt/hive/lib/hive-common-3.1.2.jar /opt/sqoop/lib/
cp /opt/hive/lib/hive-exec-3.1.2.jar /opt/sqoop/lib/

# check in Hive
hive
select * from emp;

Import a subset of a table

# import
hadoop fs -rm -r /wherequery;
sqoop import --connect jdbc:mysql://192.168.131.144:3306/userdb --username root --password root --target-dir /wherequery --query 'select id,name,deg from emp WHERE id>1203 and $CONDITIONS' --split-by id --fields-terminated-by '\t' --m 1;

# view the result
hadoop fs -cat /wherequery/part-m-*

Exporting Data with Sqoop

use userdb;
drop table if exists employee;
CREATE TABLE if not exists employee (
id INT NOT NULL PRIMARY KEY,
name VARCHAR(20),
deg VARCHAR(20),
salary INT,
dept VARCHAR(10));
# export
sqoop export --connect jdbc:mysql://192.168.131.144:3306/userdb --username root --password root --table employee --export-dir /user/root/emp;
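To confirm that the rows actually landed in MySQL, the employee table can be queried through the same Docker container used earlier (a sketch; adjust the container name and credentials if yours differ):

# check the exported rows
docker exec -it mysql8.0 mysql -uroot -proot -e "SELECT * FROM userdb.employee;"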