Posted by youzhengchuan on 2016-12-20 11:04

HDFS reports "GSS initiate failed" after enabling Kerberos authentication

Last edited by youzhengchuan on 2016-12-20 11:09

Kerberos authentication is currently configured only on the Hadoop cluster (NameNode, DataNode, JournalNode). The configuration files are as follows:
core-site.xml
    <!-- hadoop security configure -->
    <property>
        <name>hadoop.security.authentication</name>
        <value>kerberos</value>
    </property>
    <property>
        <name>hadoop.security.authorization</name>
        <value>true</value>
    </property>
    <property>
        <name>hadoop.rpc.protection</name>
        <value>authentication</value>
    </property>
    <!-- hadoop security configure end -->
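
A bare "GSS initiate failed" usually hides the real cause; one way to surface it is the JDK's built-in Kerberos debug switch. A minimal sketch, assuming the daemons pick up JVM options from hadoop-env.sh:

# Append to hadoop-env.sh on the affected nodes, then restart the daemons;
# sun.security.krb5.debug makes the JDK print the underlying Kerberos error:
export HADOOP_OPTS="$HADOOP_OPTS -Dsun.security.krb5.debug=true"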


hdfs-site.xml
<!-- hadoop for kerberos configure -->
    <!-- Kerberos NN -->
    <property>
      <name>dfs.block.access.token.enable</name>
      <value>true</value>
    </property>
    <property>
      <name>dfs.namenode.keytab.file</name>
      <value>/etc/hadoop/hdfs.keytab</value>
    </property>
    <property>
      <name>dfs.namenode.kerberos.principal</name>
      <value>hdfs/_HOST@MYZONE.COM</value>
    </property>
    <property>
      <name>dfs.namenode.kerberos.internal.spnego.principal</name>
      <value>hdfs/_HOST@MYZONE.COM</value>
    </property>
    <!--<property>
      <name>dfs.namenode.kerberos.https.principal</name>
      <value>hdfs/_HOST@MYZONE.COM</value>
    </property>-->
    <!-- Kerberos DN -->
    <property>
      <name>dfs.datanode.keytab.file</name>
      <value>/etc/hadoop/hdfs.keytab</value>
    </property>
    <property>
      <name>dfs.datanode.kerberos.principal</name>
      <value>hdfs/_HOST@MYZONE.COM</value>
    </property>
    <!--<property>
      <name>dfs.datanode.kerberos.https.principal</name>
      <value>hdfs/_HOST@MYZONE.COM</value>
    </property>
    <property>
      <name>dfs.datanode.data.dir.perm</name>
      <value>700</value>
    </property>-->
    <property>
      <name>dfs.datanode.address</name>
      <value>0.0.0.0:61004</value>
    </property>
    <property>
      <name>dfs.datanode.http.address</name>
      <value>0.0.0.0:61006</value>
    </property>
    <property>
      <name>dfs.https.port</name>
      <value>50470</value>
    </property>
    <property>
      <name>dfs.http.policy</name>
      <value>HTTPS_ONLY</value>
    </property>
    <property>
      <name>dfs.data.transfer.protection</name>
      <value>integrity</value>
    </property>
    <property>
      <name>dfs.web.authentication.kerberos.keytab</name>
      <value>/etc/hadoop/hdfs.keytab</value>
    </property>
    <property>
      <name>dfs.web.authentication.kerberos.principal</name>
      <value>hdfs/_HOST@MYZONE.COM</value>
    </property>
    <!-- hadoop for kerberos configure end -->
    <!-- hadoop for kerberos configure for journalnode -->
    <property>
      <name>dfs.journalnode.keytab.file</name>
      <value>/etc/hadoop/hdfs.keytab</value>
    </property>
    <property>
      <name>dfs.journalnode.kerberos.principal</name>
      <value>hdfs/_HOST@MYZONE.COM</value>
    </property>
    <property>
      <name>dfs.journalnode.kerberos.internal.spnego.principal</name>
      <value>hdfs/_HOST@MYZONE.COM</value>
    </property>
    <!-- hadoop for kerberos configure for journalnode end-->
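
Every principal above relies on _HOST substitution, so it is worth confirming on each node that the keytab can actually obtain a ticket for that node's own principal. A minimal check, assuming "hostname -f" matches what _HOST expands to:

# Run on every NameNode/DataNode/JournalNode host:
kinit -kt /etc/hadoop/hdfs.keytab hdfs/$(hostname -f)@MYZONE.COM && klist
kdestroy   # clean up the test ticket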

mapred-site.xml
<?xml version="1.0"?>
(the rest of this file was cut off by the forum display)

ssl-client.xml
<property>
<name>ssl.client.truststore.location</name>
<value>/root/truststore</value>
<description>Truststore to be used by clients like distcp. Must be specified.</description>
</property>

<property>
<name>ssl.client.truststore.password</name>
<value>changeit</value>
<description>Optional. Default value is "".</description>
</property>

<property>
<name>ssl.client.truststore.type</name>
<value>jks</value>
<description>Optional. The keystore file format, default value is "jks".
</description>
</property>

<property>
<name>ssl.client.truststore.reload.interval</name>
<value>10000</value>
<description>Truststore reload check interval, in milliseconds. Default value is 10000 (10 seconds).</description>
</property>

<property>
<name>ssl.client.keystore.location</name>
<value>/root/keystore</value>
<description>Keystore to be used by clients like distcp. Must be specified.</description>
</property>

<property>
<name>ssl.client.keystore.password</name>
<value>changeit</value>
<description>Optional. Default value is "".</description>
</property>

<property>
<name>ssl.client.keystore.keypassword</name>
<value>changeit</value>
<description>Optional. Default value is "".</description>
</property>

<property>
<name>ssl.client.keystore.type</name>
<value>jks</value>
<description>Optional. The keystore file format, default value is "jks".
</description>
</property>

ssl-server.xml
<property>
<name>ssl.server.truststore.location</name>
<value>/root/truststore</value>
<description>Truststore to be used by NN and DN. Must be specified.</description>
</property>

<property>
<name>ssl.server.truststore.password</name>
<value>changeit</value>
<description>Optional. Default value is "".</description>
</property>

<property>
<name>ssl.server.truststore.type</name>
<value>jks</value>
<description>Optional. The keystore file format, default value is "jks".
</description>
</property>

<property>
<name>ssl.server.truststore.reload.interval</name>
<value>10000</value>
<description>Truststore reload check interval, in milliseconds.
Default value is 10000 (10 seconds).
</description>
</property>

<property>
<name>ssl.server.keystore.location</name>
<value>/root/keystore</value>
<description>Keystore to be used by NN and DN. Must be specified.</description>
</property>

<property>
<name>ssl.server.keystore.password</name>
<value>changeit</value>
<description>Must be specified.</description>
</property>

<property>
<name>ssl.server.keystore.keypassword</name>
<value>changeit</value>
<description>Must be specified.
</description>
</property>

<property>
<name>ssl.server.keystore.type</name>
<value>jks</value>
<description>Optional. The keystore file format, default value is "jks".
</description>
</property>
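
For reference, a hedged sketch of how the /root/keystore and /root/truststore referenced above could be produced with the JDK's keytool (the alias and DN below are illustrative, not from the original post):

# Per host: generate a key pair, export its certificate, then import every
# host's certificate into the shared truststore:
keytool -genkeypair -alias $(hostname -f) -keyalg RSA -keysize 2048 \
  -dname "CN=$(hostname -f)" -keystore /root/keystore \
  -storepass changeit -keypass changeit
keytool -exportcert -alias $(hostname -f) -keystore /root/keystore \
  -storepass changeit -file /tmp/$(hostname -f).crt
keytool -importcert -noprompt -alias $(hostname -f) \
  -file /tmp/$(hostname -f).crt -keystore /root/truststore -storepass changeit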

Keytab file:
# klist -ket /etc/hadoop/hdfs.keytab
(the output was cut off here; see the follow-up post below for the full listing)

After starting the HDFS cluster with start-dfs.sh, the NameNode, ZKFC, DataNode, and JournalNode processes all appear to be running normally.
Problems:
1. On the web UI, every DataNode shows as dead.
2. The NameNode logs show that it fails to connect to the JournalNodes:
2016-12-20 10:23:15,996 WARN org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager: Waited 19015 ms (timeout=20000 ms) for a response for selectInputStreams. No responses yet.
2016-12-20 10:23:16,982 WARN org.apache.hadoop.hdfs.server.namenode.FSEditLog: Unable to determine input streams from QJM to . Skipping.
java.io.IOException: Timed out waiting 20000ms for a quorum of nodes to respond.
      at org.apache.hadoop.hdfs.qjournal.client.AsyncLoggerSet.waitForWriteQuorum(AsyncLoggerSet.java:137)
      at org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager.selectInputStreams(QuorumJournalManager.java:471)
      at org.apache.hadoop.hdfs.server.namenode.JournalSet.selectInputStreams(JournalSet.java:278)
      at org.apache.hadoop.hdfs.server.namenode.FSEditLog.selectInputStreams(FSEditLog.java:1463)
      at org.apache.hadoop.hdfs.server.namenode.FSEditLog.selectInputStreams(FSEditLog.java:1487)
      at org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:644)
      at org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:281)
      at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:1022)
      at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:741)
      at org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:538)
      at org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:597)
      at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:764)
      at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:748)
      at org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1441)
      at org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1507)
3. The JournalNode logs show that the NameNode fails authentication:
2016-12-20 10:23:05,425 WARN org.apache.hadoop.util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2016-12-20 10:23:07,110 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.22:52383:null (GSS initiate failed)
2016-12-20 10:23:07,192 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.21:59799:null (GSS initiate failed)
2016-12-20 10:23:08,709 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.22:59076:null (GSS initiate failed)
2016-12-20 10:23:10,149 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.21:17869:null (GSS initiate failed)
2016-12-20 10:23:13,265 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.22:2416:null (GSS initiate failed)
2016-12-20 10:23:14,653 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.21:63330:null (GSS initiate failed)
2016-12-20 10:23:17,083 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.22:41827:null (GSS initiate failed)
2016-12-20 10:23:17,256 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.21:41136:null (GSS initiate failed)
2016-12-20 10:23:21,249 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.21:18681:null (GSS initiate failed)
2016-12-20 10:23:21,806 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.22:39155:null (GSS initiate failed)
2016-12-20 10:23:21,827 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.21:48970:null (GSS initiate failed)
2016-12-20 10:23:21,838 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.21:44755:null (GSS initiate failed)
2016-12-20 10:23:25,880 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.21:12379:null (GSS initiate failed)
4. The DataNode logs show that the DataNodes time out connecting to the NameNode:
2016-12-20 10:56:05,183 WARN org.apache.hadoop.ipc.Client: Couldn't setup connection for hdfs/oitunnel-data-5@MYZONE.COM to oitunnel-data-2/10.11.97.22:53310
5. The NameNode logs show that the DataNodes fail authentication:
2016-12-20 10:55:31,898 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.25:33131:null (GSS initiate failed)
2016-12-20 10:55:32,581 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.23:16775:null (GSS initiate failed)
2016-12-20 10:55:32,915 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.24:56752:null (GSS initiate failed)
2016-12-20 10:55:33,109 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.21:31381:null (GSS initiate failed)
2016-12-20 10:55:33,334 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.26:13542:null (GSS initiate failed)
2016-12-20 10:55:33,758 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.27:9355:null (GSS initiate failed)
2016-12-20 10:55:34,239 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.23:64310:null (GSS initiate failed)
2016-12-20 10:55:34,788 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.23:31716:null (GSS initiate failed)
2016-12-20 10:55:34,822 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.25:29585:null (GSS initiate failed)
2016-12-20 10:55:35,887 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.23:8182:null (GSS initiate failed)
2016-12-20 10:55:37,023 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.25:34919:null (GSS initiate failed)
2016-12-20 10:55:37,616 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.24:6497:null (GSS initiate failed)
2016-12-20 10:55:37,698 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.21:63277:null (GSS initiate failed)
2016-12-20 10:55:38,340 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.26:40629:null (GSS initiate failed)
2016-12-20 10:55:38,463 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.27:7985:null (GSS initiate failed)
2016-12-20 10:55:38,572 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.27:3231:null (GSS initiate failed)
2016-12-20 10:55:38,705 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.21:4416:null (GSS initiate failed)
2016-12-20 10:55:39,097 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.21:51185:null (GSS initiate failed)
2016-12-20 10:55:39,129 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.26:48846:null (GSS initiate failed)
2016-12-20 10:55:39,961 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.23:7155:null (GSS initiate failed)
2016-12-20 10:55:40,731 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.27:33864:null (GSS initiate failed)
2016-12-20 10:55:40,887 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.24:29617:null (GSS initiate failed)
2016-12-20 10:55:42,029 WARN SecurityLogger.org.apache.hadoop.ipc.Server: Auth failed for 10.11.97.25:19005:null (GSS initiate failed)

However, on the Kerberos server (KDC), the corresponding authentication requests do show up, and tickets are being issued:
Dec 20 10:23:21 oi-tunnel15 krb5kdc(info): TGS_REQ (5 etypes {17 16 23 1 3}) 10.11.97.21: ISSUE: authtime 1482200590, etypes {rep=17 tkt=18 ses=17}, hdfs/oitunnel-data-1@MYZONE.COM for hdfs/oitunnel-data-1@MYZONE.COM
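
Since the KDC is clearly issuing tickets while the server side still rejects the GSS handshake, hostname canonicalization is one common suspect worth ruling out. A quick check on each node (nothing below is from the original logs):

# The principal hdfs/<fqdn> must match what Java resolves for the node:
hostname -f                  # should print the FQDN used in the keytab
getent hosts 10.11.97.21     # forward and reverse lookups should agree
grep -v '^#' /etc/hosts      # watch for the FQDN mapped to 127.0.0.1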



Posted by youzhengchuan on 2016-12-20 11:11

For some reason, part of the content above seems to have been cut off by the forum, so I am adding it here.
Keytab file:
# klist -ket /etc/hadoop/hdfs.keytab
Keytab name: FILE:/etc/hadoop/hdfs.keytab
KVNO Timestamp Principal
---- ----------------- --------------------------------------------------------
3 12/19/16 21:42:23 hdfs/oitunnel-data-1@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 hdfs/oitunnel-data-1@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-1@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-1@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-1@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-1@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 hdfs/oitunnel-data-2@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 hdfs/oitunnel-data-2@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-2@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-2@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-2@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-2@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 hdfs/oitunnel-data-3@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 hdfs/oitunnel-data-3@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-3@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-3@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-3@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-3@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 hdfs/oitunnel-data-4@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 hdfs/oitunnel-data-4@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-4@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-4@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-4@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-4@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 hdfs/oitunnel-data-5@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 hdfs/oitunnel-data-5@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-5@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-5@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-5@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-5@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 hdfs/oitunnel-data-6@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 hdfs/oitunnel-data-6@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-6@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-6@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-6@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-6@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 hdfs/oitunnel-data-7@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 hdfs/oitunnel-data-7@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-7@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 mapred/oitunnel-data-7@MYZONE.COM (aes128-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-7@MYZONE.COM (aes256-cts-hmac-sha1-96)
3 12/19/16 21:42:23 yarn/oitunnel-data-7@MYZONE.COM (aes128-cts-hmac-sha1-96)
4 12/19/16 21:42:23 zookeeper/oi-tunnel08@MYZONE.COM (aes256-cts-hmac-sha1-96)
4 12/19/16 21:42:23 zookeeper/oi-tunnel08@MYZONE.COM (aes128-cts-hmac-sha1-96)
4 12/19/16 21:42:23 zookeeper/oi-tunnel09@MYZONE.COM (aes256-cts-hmac-sha1-96)
4 12/19/16 21:42:23 zookeeper/oi-tunnel09@MYZONE.COM (aes128-cts-hmac-sha1-96)
4 12/19/16 21:42:23 zookeeper/oi-tunnel10@MYZONE.COM (aes256-cts-hmac-sha1-96)
4 12/19/16 21:42:23 zookeeper/oi-tunnel10@MYZONE.COM (aes128-cts-hmac-sha1-96)
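
One detail that may matter here: the keytab contains only aes256-cts and aes128-cts keys. On an Oracle JDK 7/8 without the JCE Unlimited Strength policy files installed, AES-256 Kerberos keys cannot be used, and a bare "GSS initiate failed" is a typical symptom. A quick hedged check, assuming a JDK 7/8 with jrunscript available:

# Prints 128 if the JCE Unlimited Strength policy files are missing,
# 2147483647 if unlimited-strength crypto is available:
jrunscript -e 'print(javax.crypto.Cipher.getMaxAllowedKeyLength("AES"))'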

Posted by heguangwu on 2016-12-20 11:20

Many different problems can produce this. First, check whether the clocks on the Hadoop hosts are synchronized. Also, a "GSS initiate failed" log entry is usually followed by the underlying cause, e.g. a line starting with "Caused by: ...".
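
For example, a quick way to spot clock skew across the nodes (a sketch; the hostnames are taken from the thread above):

# Kerberos tolerates only a small clock skew (5 minutes by default):
for h in oitunnel-data-1 oitunnel-data-2 oitunnel-data-3; do ssh "$h" date +%s; done
ntpstat    # or: ntpq -p  -- verify NTP is actually synchronized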