Cfengine: Glasgow Worker Nodes

This is an excerpt from Glasgow's cfengine configuration showing how we manage our worker nodes. Once your configuration gets big it is probably a good idea to split it into separate files, but it is presented here as a single cfagent.conf for simplicity.
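
A minimal sketch of how such a split might look, using cfengine 2's import section (the cf.* file names below are purely illustrative and not part of the Glasgow setup; imported files are looked up in the cfengine inputs directory):

import:
        any::
                # shared groups and control settings
                cf.groups
                cf.site
        worker::
                # worker-node specific rules
                cf.worker
        gridsvr::
                # grid server specific rules
                cf.gridsvr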

Note that worker nodes are members of the following classes: worker, grid, torque, autofs, scientific_sl_3 and, of course, any (the built-in class that matches every host).

##########################
#
# cfagent.conf for UKI-SCOTGRID-GLASGOW
#
# $Id: cfagent.conf 201 2006-11-22 12:43:00Z root $
#
##########################


groups:
        worker = ( HostRange(node,1-140) )
        gridsvr = ( HostRange(svr,016-023) )
        disksvr = ( HostRange(disk,032-041) )

        # Nicer names for grid servers
        ce      = ( svr016 )
        dpm     = ( svr018 )
        mon     = ( svr019 )
        ui      = ( svr020 )
        sitebdii = ( svr021 )

        ## Compound groups
        # Batch system nodes
        torque = ( ce worker )

        # Nodes which look at the autofs system
        autofs = ( worker ce ui )

        # All grid nodes
        grid = ( worker gridsvr disksvr )


control: 
        any::
                actionsequence = ( 
                        directories
                        files
                        links 
                        editfiles 
                        packages
                        copy
                        shellcommands
                        tidy
                        ) 

                domain = ( beowulf.cluster )
                skel = ( /var/cfengine/inputs/skel )
                scripts = ( /var/cfengine/inputs/scripts )
                syslog = ( on )
                ChecksumUpdates = ( on )

                DefaultPkgMgr = ( rpm )
                RPMcommand = ( /bin/rpm )
                RPMInstallCommand = ( "/usr/bin/yum -y install %s" )

        torque::
                torquesvr = ( 10.141.255.16 )
                torquequeues = ( dteam:atlas:alice:cms:lhcb:biom:pheno:zeus:sixt:ilc:babar:dzero:ngs:ops:glpnp:glpppt:glee:glbio )

        # It would be nicer if this could be defined more dynamically...
        scientific_sl_3::
                java = ( j2sdk1.4.2_12 )

        !gridsvr::
                # cfengine will run once an hour, so splay the cluster across 50 minutes
                # to ensure the load on the master server is not too high
                splaytime = ( 50 )

        gridsvr::
                splaytime = ( 5 )

directories:
        grid::
                # We need to create the locations for files to be copied into - copy runs before shellcommands
                /opt/glite/yaim mode=0700 owner=root group=root
                /opt/glite/yaim/etc mode=0755 owner=root group=root
                /opt/glite/yaim/functions/local mode=0755 owner=root group=root
                /etc/grid-security mode=0755 owner=root group=root

        scientific_sl_3::
                /usr/java/$(java) mode=0755 owner=root group=root

        torque::
                /var/spool/pbs/mom_priv mode=0755 owner=root group=root
                /gridstorage mode=0755 owner=root group=root
                /home mode=0755 owner=root group=root

links: 
        grid.scientific_sl_3::
                # In YAIM we give java location as /usr/java/current, and link here
                # (It would be much better if grid stuff just used /etc/java.conf or JAVA_HOME, *sigh*)
                /usr/java/current -> /usr/java/j2sdk1.4.2_12

        torque::
                /gridstorage/exptsw -> /grid/exp_soft

packages:
        any::
                ganglia-gmond action=install elsedefine=newgmon

        grid::
                lcg-CA action=install version=1.10
                # N.B. note that runyaim happens when the yaim package is first installed
                glite-yaim action=install elsedefine=runyaim

        worker::
                # Worker node meta package
                glite-WN action=install

        torque|ui::
                # Packages requested by VOs
                gcc action=install
                gcc-ssa action=install
                gcc-g77 action=install
                gcc-g77-ssa action=install
                zsh action=install
                zlib-devel action=install
                compat-libstdc++ action=install

tidy:
        any::
                # Make sure this is > max wallclock for the batch system!
                /tmp pattern=* age=12 recurse=inf

copy:
        # Master server is exempt from default files
        any.!svr031::
                # Root's environment
                $(skel)/common/root/.bash_profile mode=0644 dest=/root/.bash_profile type=sum
                $(skel)/common/root/.bashrc mode=0644 dest=/root/.bashrc type=sum
                $(skel)/common/root/.ssh/authorized_keys mode=0644 dest=/root/.ssh/authorized_keys type=sum
                # Security for servers and ssh
                $(skel)/common/etc/ssh/ssh_known_hosts mode=644 dest=/etc/ssh/ssh_known_hosts type=sum
                $(skel)/common/etc/ssh/ssh_config mode=644 dest=/etc/ssh/ssh_config define=newssh type=sum
                $(skel)/common/etc/ssh/sshd_config mode=600 dest=/etc/ssh/sshd_config define=newssh type=sum
                # Time, time, time!
                $(skel)/common/etc/ntp.conf mode=644 dest=/etc/ntp.conf define=newntp type=sum
                $(skel)/common/etc/ntp/step-tickers mode=644 dest=/etc/ntp/step-tickers define=newntp type=sum
                # Environment for interactive shells (and jobs)
                $(skel)/common/etc/profile.d/proxy.csh mode=644 dest=/etc/profile.d/proxy.csh type=sum
                $(skel)/common/etc/profile.d/proxy.sh mode=644 dest=/etc/profile.d/proxy.sh type=sum
                $(skel)/common/etc/profile.d/tmpdir.csh mode=644 dest=/etc/profile.d/tmpdir.csh type=sum
                $(skel)/common/etc/profile.d/tmpdir.sh mode=644 dest=/etc/profile.d/tmpdir.sh type=sum
                # Post boot signaling script
                # This is an important part of Glasgow's auto install - it signals to the master server when the first boot
                # after kickstart has happened.
                $(skel)/common/etc/rc.d/rc.local mode=644 dest=/etc/rc.d/rc.local type=sum


        grid::
                # GridPP VOMS + YAIM setup for workers
                $(skel)/grid/etc/grid-security/vomsdir/voms.gridpp.ac.uk mode=0644 dest=/etc/grid-security/vomsdir/voms.gridpp.ac.uk type=sum
                $(skel)/yaim/site-info.def mode=600 dest=/opt/glite/yaim/etc/site-info.def type=sum
                $(skel)/yaim/groups.conf mode=644 dest=/opt/glite/yaim/etc/groups.conf type=sum
                $(skel)/yaim/users.conf mode=644 dest=/opt/glite/yaim/etc/users.conf type=sum
                # We don't let YAIM do users - so override to a blank function
                $(skel)/yaim/local/config_users mode=644 dest=/opt/glite/yaim/functions/local/config_users type=sum

        torque::
                $(skel)/torque/shosts.equiv mode=644 dest=/etc/ssh/shosts.equiv type=sum

        torque|ui|dpm|disksvr::
                # On torque hosts (and the ui) distribute the shadow and password files to avoid problems with account locking, etc.
                # DPM and disk servers need this for gridftp
                $(skel)/torque/passwd mode=644 dest=/etc/passwd define=localpoolaccounts type=sum
                $(skel)/torque/shadow mode=400 dest=/etc/shadow type=sum
                $(skel)/torque/group mode=644 dest=/etc/group type=sum

        autofs::
                $(skel)/autofs/auto.cluster mode=0644 dest=/etc/auto.cluster define=newautofs type=sum
                $(skel)/autofs/auto.grid mode=0644 dest=/etc/auto.grid define=newautofs type=sum
                $(skel)/autofs/auto.master mode=0644 dest=/etc/auto.master define=newautofs type=sum

        gridsvr::
                $(skel)/gridsvr/etc/gmond.conf mode=0644 dest=/etc/gmond.conf define=newgmon type=sum

        worker::
                $(skel)/worker/etc/gmond.conf mode=0644 dest=/etc/gmond.conf define=newgmon type=sum
                # Worker nodes need to route directly to grid and disk servers even when their public IPs are given
                $(skel)/worker/etc/sysconfig/network-scripts/route-eth1 mode=0644 dest=/etc/sysconfig/network-scripts/route-eth1 define=needroute type=sum


shellcommands:
        newgmon::
                "/sbin/service gmond restart" umask=022

        newssh::
                "/sbin/service sshd restart" umask=022

        newntp::
                "/sbin/service ntpd restart" umask=022

        newautofs::
                "/sbin/service autofs restart" umask=022

        newtorque::
                "/sbin/service pbs_server restart" umask=022

        newmaui::
                "/sbin/service maui restart" umask=022

        newhttp::
                "/sbin/service httpd restart" umask=022

        localpoolaccounts.!ui::
                "/var/cfengine/inputs/scripts/local_pool_accounts /etc/passwd" umask=022

        worker.needroute::
                "/sbin/ip route add 130.209.239.16/28 dev eth1" umask=022
                "/sbin/ip route add 130.209.239.32/28 dev eth1" umask=022

        worker.runyaim::
                # Only define startmom if this looks ok, otherwise withdraw from the batch system
                "/opt/glite/yaim/scripts/configure_node /opt/glite/yaim/etc/site-info.def WN"  umask=022 define=startmom elsedefine=stopmom

        worker.startmom::
                "/sbin/chkconfig pbs_mom on" umask=022
                "/sbin/service pbs_mom restart" umask=022

        worker.stopmom::
                "/sbin/chkconfig pbs_mom off" umask=022
                "/sbin/service pbs_mom stop" umask=022