Recent Changes - Search:

Configuring a Xeon-Phi in Rocks Clusters

by Juan-Carlos Maureira

Introduction

Cluster Layout

Master node (frontend) configuration

Initial condition

[root@frontend ~]# rocks list host interface node
SUBNET  IFACE MAC               IP          NETMASK       MODULE NAME VLAN OPTIONS CHANNEL
private eth0  c8:1f:66:cf:25:8f 10.30.1.254 255.255.255.0 ------ node ---- ------- -------
ipmi    ipmi  ----------------- 10.30.0.23  255.255.255.0 ------ node ---- ------- -------
[root@frontend ~]# 

Install bridge-utils

[root@frontend ~]# yum install bridge-utils
Rocks-6.1.1                                              | 3.6 kB     00:00     
Rocks-6.1.1/primary_db                                   | 2.5 MB     00:00     
Setting up Install Process
Resolving Dependencies
--> Running transaction check
---> Package bridge-utils.x86_64 0:1.2-10.el6 will be installed
--> Finished Dependency Resolution

Dependencies Resolved

================================================================================
 Package             Arch          Version             Repository          Size
================================================================================
Installing:
 bridge-utils        x86_64        1.2-10.el6          Rocks-6.1.1         30 k

Transaction Summary
================================================================================
Install       1 Package(s)

Total download size: 30 k
Installed size: 57 k
Downloading Packages:
bridge-utils-1.2-10.el6.x86_64.rpm                       |  30 kB     00:00     
Running rpm_check_debug
Running Transaction Test
Transaction Test Succeeded
Running Transaction
  Installing : bridge-utils-1.2-10.el6.x86_64                               1/1 
  Verifying  : bridge-utils-1.2-10.el6.x86_64                               1/1 

Installed:
  bridge-utils.x86_64 0:1.2-10.el6                                              

Complete!

[root@frontend ~]# 

Next, note down the MAC address and the private IP address of the interface to be bridged (eth0 in our case). Then remove eth0 from the Rocks database and add it back, specifying only its MAC address. In a separate command, set the channel of eth0 to "br0". Finally, add a new interface called br0 in the private subnet, using the original IP address of eth0.

[root@frontend ~]# rocks remove host interface node eth0
[root@frontend ~]# rocks add host interface node eth0 mac=c8:1f:66:cf:25:8f
[root@frontend ~]# rocks set host interface channel node eth0 br0
[root@frontend ~]# rocks add host interface node br0 subnet=private \
                   ip=10.30.1.254 module=bridged name=node
[root@frontend ~]# 

The final configuration for node at the frontend should look like this:

[root@frontend ~]# rocks list host interface panotia
SUBNET  IFACE MAC               IP          NETMASK       MODULE  NAME VLAN OPTIONS CHANNEL
iscsi-1 eth4  00:0a:f7:41:72:b0 10.60.1.254 255.255.255.0 ------- node ---- ------- -------
------- eth0  c8:1f:66:cf:25:8f ----------- ------------- ------- ---- ---- ------- br0    
private br0   ----------------- 10.30.1.254 255.255.255.0 bridged node ---- ------- -------
ipmi    ipmi  ----------------- 10.30.0.23  255.255.255.0 ------- node ---- ------- -------

[root@frontend ~]# 

Compute node configuration (Xeon-Phi hosts)

Install the MPSS 3.5

[root@node ~]# micctrl -s
mic0: online (mode: linux image: /usr/share/mpss/boot/bzImage-knightscorner)
mic1: online (mode: linux image: /usr/share/mpss/boot/bzImage-knightscorner)
[root@node ~]# 

Stop the MPSS and generate the default init configuration

[root@node ~]# /etc/init.d/mpss stop
Shutting down Intel(R) MPSS:                               [  OK  ]

[root@node ~]#  micctrl --initdefaults
[Warning] mic0: Generating compatibility network 
config file /opt/intel/mic/filesystem/mic0/etc/sysconfig/network/ifcfg-mic0 for IDB.
[Warning]       This may be problematic at best and will be removed in a future release,
Check with the IDB release.
[Warning] mic1: Generating compatibility network 
config file /opt/intel/mic/filesystem/mic1/etc/sysconfig/network/ifcfg-mic0 for IDB.
[Warning]       This may be problematic at best and will be removed in a future release,
Check with the IDB release.

[root@node ~]# 

Edit the file /etc/mpss/default.conf and append the following line (note the ">>": a single ">" would overwrite the whole file)

[root@node ~]# echo "Bridge br0 External dhcp 9000" >> /etc/mpss/default.conf
[root@node ~]# cat /etc/mpss/default.conf
# Common /etc files for all embedded Linux file systems
CommonDir /var/mpss/common

ExtraCommandLine "highres=off"

# MIC Console
Console "hvc0"

# MIC Shutdown timeout - Wait for orderly shutdown to complete
# via service MPSS stop/unload and micctrl --shutdown or --reboot and --wait
# +ve integer -> Time in seconds to wait for shutdown to complete before forcing reset
# -ve integer -> Infinite wait for orderly shutdown to complete
# 0           -> Forced shutdown or reset. NOT RECOMMENDED!
ShutdownTimeout 300

# Storage location and size for MIC kernel crash dumps
CrashDump /var/crash/mic 16

Bridge br0 External 10.30.1.254 24 9000
[root@node ~]# 
[root@node ~]# micctrl --network=dhcp --bridge=br0 --ip=dhcp -c yes  mic0 
[Warning] Adding an interface to a bridge will use the bridges netmask - ignoring netbits 24
[Warning] DHCP config will ignore ip address input 'dhcp'

          mic0: Changing network to external bridge br0

[root@node ~]# 


[root@node ~]# rocks add appliance mic membership='Xeon Phi' node='mic'
[root@node ~]# 
[root@node ~]# rocks add host alias panotia-mic0 mic0
[root@node ~]# rocks add host alias panotia-mic1 mic1
[root@node ~]# 
[root@node ~]# yum install mpss-ganglia-web-3.5-1.glibc2.12.2.x86_64.rpm 
[root@node ~]# 

On the Xeon Phi card:

[root@mic0 ~]# zypper ar http://10.30.1.10/install/rocks-dist/k1om mpss
[root@mic0 ~]# 
[root@mic0 ~]# zypper install ganglia
File 'repomd.xml' from repository 'mpss' is unsigned, continue? [yes/no] (no): y
Building repository 'mpss' cache [done]
Loading repository data...
Reading installed packages...
Resolving package dependencies...

The following NEW packages are going to be installed:
  ganglia libapr-1-0 libconfuse0 

3 new packages to install.
Overall download size: 271.0 KiB. After the operation, additional 1.1 MiB will be used.
Continue? [y/n/?] (y): y
Retrieving package libconfuse0-2.7-r1.k1om (1/3), 24.0 KiB (45.0 KiB unpacked)
Retrieving: libconfuse0-2.7-r1.k1om.rpm [done]
Retrieving package libapr-1-0-1.4.6-r0.k1om (2/3), 83.0 KiB (178.0 KiB unpacked)
Retrieving: libapr-1-0-1.4.6-r0.k1om.rpm [done]
Retrieving package ganglia-3.1.7-r0.k1om (3/3), 164.0 KiB (864.0 KiB unpacked)
Retrieving: ganglia-3.1.7-r0.k1om.rpm [done]
Installing: libconfuse0-2.7-r1 [done]
Installing: libapr-1-0-1.4.6-r0 [done]
Installing: ganglia-3.1.7-r0 [done]

[root@mic0 ~]#  zypper install mpss-ganglia 
Loading repository data...
Reading installed packages...
Resolving package dependencies...

The following NEW package is going to be installed:
  mpss-ganglia 

1 new package to install.
Overall download size: 14.0 KiB. After the operation, additional 35.0 KiB will be used.
Continue? [y/n/?] (y): y
Retrieving package mpss-ganglia-mpss-r0.k1om (1/1), 14.0 KiB (35.0 KiB unpacked)
Retrieving: mpss-ganglia-mpss-r0.k1om.rpm [done]
Installing: mpss-ganglia-mpss-r0 [done]

[root@mic0 ~]# 

Update the Ganglia configuration in /etc/ganglia/gmond.conf, in particular the multicast IP address to report to (Rocks' Ganglia uses a multicast address to collect monitoring data from the compute nodes) and the cluster information.

 /* UDP Channels for Send and Recv */                
udp_recv_channel {                  
        mcast_join = 224.0.0.3                                  
        port = 8649                                                             
}                                   
udp_send_channel {                                                           
        mcast_join = 224.0.0.3      
        port = 8649                 
}   

/* Cluster Specific attributes */
cluster {
	name = "My Cluster Name"
	owner = "My organization"
	latlong = "S33.45 W70.6667"
	url = "My web page"
} 

Autofs for accessing remote mount points

Check connectivity to all remote servers before configuring this. Note that the MIC accesses the network the same way the host compute node does (bridged configuration). So, check that the route back from the remote servers to the MIC is operational, and implement the required IP masquerading for accessing remote networks from which the MIC is unreachable.

[root@host ~]# scp /etc/hosts mic0:/etc
[root@host ~]# 
[root@mic0 ~]# zypper install autofs 
[root@mic0 ~]# 
[root@mic0 ~]# cat /etc/auto.home
user1	-nfsvers=3	camanchaca.local:/users/user1
user2   -nfsvers=3	camanchaca.local:/users/user2
[root@mic0 ~]# 
[root@mic0 ~]# cat /etc/auto.share
apps	camanchaca.local:/export/&
bio 	camanchaca.local:/export/&
root    camanchaca.local:/export/&
[root@mic0 ~]# 

Running MIC native binaries from the host (binary offloading)

The purpose of this feature is to let you execute a native K1OM binary on the host and have its execution automatically offloaded to the MIC. This configuration also makes it possible to run native MIC binaries from the host queue manager, treating the MIC as a consumable resource.

[root@node ~]# cat /usr/bin/runmic
#!/bin/bash
#
# runmic - run a native K1OM (Xeon Phi) binary through micnativeloadex.
#
# Usage: runmic <binary> [args...]
#
# The first argument is the binary to load. All remaining arguments are joined
# into one space-separated string and passed through the loader's -a option.

# Extend SINK_LD_LIBRARY_PATH with the host LD_LIBRARY_PATH and the Intel MIC
# library directories (presumably the search path micnativeloadex uses for the
# card-side libraries -- confirm against the MPSS documentation).
export SINK_LD_LIBRARY_PATH="$SINK_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
export SINK_LD_LIBRARY_PATH="$SINK_LD_LIBRARY_PATH:/opt/intel/mic/lib64"
export SINK_LD_LIBRARY_PATH="$SINK_LD_LIBRARY_PATH:/opt/intel/lib/mic"
export SINK_LD_LIBRARY_PATH="$SINK_LD_LIBRARY_PATH:/opt/intel/mkl/lib/mic"

# Append the current working directory to PATH.
export PATH="$PATH:$(pwd)"

cmd=$1
shift
# Join the remaining arguments into a single string.
args="$*"
# Quote handling kept exactly as in the original script; presumably meant to
# escape quotes embedded in the arguments -- TODO confirm the intended result.
args="${args//\'/\'}"
args="${args//\"/\\\"}"
# Quote the binary path so that a path containing spaces stays one argument.
/opt/intel/mic/bin/micnativeloadex "$cmd" -a "$args"
[root@node ~]# 

SSH version

[root@node ~]# cat /usr/bin/runmic
#!/bin/bash
#
# runmic (SSH version) - run a native K1OM (Xeon Phi) binary on mic0 over ssh.
#
# Usage: runmic <binary> [args...]
#
# The first argument is the binary to run. All remaining arguments are joined
# into one space-separated string and appended to the remote command line.

# Extend SINK_LD_LIBRARY_PATH with the host LD_LIBRARY_PATH and the Intel MIC
# library directories.
# NOTE(review): the ssh command below exports LD_LIBRARY_PATH, not
# SINK_LD_LIBRARY_PATH, so these values do not reach the remote command line --
# confirm whether that is intended.
export SINK_LD_LIBRARY_PATH="$SINK_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
export SINK_LD_LIBRARY_PATH="$SINK_LD_LIBRARY_PATH:/opt/intel/mic/lib64"
export SINK_LD_LIBRARY_PATH="$SINK_LD_LIBRARY_PATH:/opt/intel/lib/mic"
export SINK_LD_LIBRARY_PATH="$SINK_LD_LIBRARY_PATH:/opt/intel/mkl/lib/mic"

# Append the current working directory to PATH.
export PATH="$PATH:$(pwd)"

cmd=$1
shift
# Join the remaining arguments into a single string.
args="$*"
# Quote handling kept exactly as in the original script; presumably meant to
# escape quotes embedded in the arguments -- TODO confirm the intended result.
args="${args//\'/\'}"
args="${args//\"/\\\"}"
# Every expansion in the quoted string happens locally, before ssh runs: the
# remote shell receives the host's LD_LIBRARY_PATH value and the host's current
# directory as literal text.
ssh mic0 -C "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; cd $(pwd); $cmd $args"
[root@node ~]# 

Add the K1OM binary fingerprint to the binary format register. The whole entry echoed to /proc/sys/fs/binfmt_misc/register must be on a single line! Please DO NOT COPY AND PASTE the command below directly into your terminal: it is shown here split across several lines, so edit it first so that it is echoed without any newline or line-feed characters.

[root@node ~]# echo ':K1OM:M::\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\xb5:
\xff\xff\xff\xff\xff\xfe\xfe\xff\xff\xff\xff\xff\xff\xff\xff\xff\xfb\xff\xff:
/usr/bin/runmic:' > /proc/sys/fs/binfmt_misc/register
[root@node ~]# 
[root@node ~]# cat hello_world.mic.c
// Minimal native MIC test program: prints a greeting on standard output and
// exits with status 0.
// NOTE(review): this source is C++ (it uses <iostream>), so it has to be
// compiled as C++ even though the file is named with a .c extension.
#include <stdlib.h>
#include <iostream>

// Standard entry point; the command-line arguments are not used.
int main(int argc, char* argv[]) {

	std::cout << "hello world from the MIC " << std::endl;
	return 0;
}
[root@node ~]# icc -mmic hello_world.mic.c -o hello_world.mic 
[root@node ~]# ./hello_world.mic 
hello world from the MIC 

[root@node ~]# 
[root@node ~]# 
[root@node ~]#