source: trunk/third/moira/dcm/dcm.pc @ 24319

Revision 24319, 14.7 KB checked in by broder, 14 years ago (diff)
New Moira snapshot from SVN.
Line 
/* $Id: dcm.pc 3970 2010-01-31 05:52:04Z zacheiss $
 *
 * The Data Control Manager for Moira.
 *
 * Copyright (C) 1987-1998 by the Massachusetts Institute of Technology.
 * For copying and distribution information, see the file
 * <mit-copyright.h>.
 */
9
10#include <mit-copyright.h>
11#include <moira.h>
12#include <moira_site.h>
13#include <moira_schema.h>
14#include "update.h"
15
16#include <sys/param.h>
17#include <sys/stat.h>
18#include <sys/wait.h>
19
20#include <errno.h>
21#include <signal.h>
22#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25#include <unistd.h>
26
27EXEC SQL INCLUDE sqlca;
28void sqlglm(char *, unsigned int *, unsigned int *);
29
30RCSID("$HeadURL: svn+ssh://svn.mit.edu/moira/trunk/moira/dcm/dcm.pc $ $Id: dcm.pc 3970 2010-01-31 05:52:04Z zacheiss $");
31
32int generate_service(char *name, int force);
33void do_hosts(char *service);
34int dcm_send_file(char *service, int type, char *host, char *target);
35int dcm_execute(char *service, char *host, char *script);
36void dbmserr(void);
37
38#define SQL_NO_MATCH 1403
39#define SOFT_FAIL(x) (((x) == MR_NO_MEM) || ((x) == MR_CANT_CONNECT) || ((x) == MR_CCONFIG) || ((x) == MR_DEADLOCK) || ((x) == MR_BUSY) || ((x) == MR_ABORT))
40
41/* argument parsing macro */
42#define argis(a, b) (!strcmp(*arg + 1, a) || !strcmp(*arg + 1, b))
43
44char whobuf[256], *whoami = whobuf, *db = "moira";
45
46enum { UNIQUE, DISTRIBUTED, REPLICATED };
47
48int main(int argc, char **argv)
49{
50  int i, force = 0;
51  EXEC SQL BEGIN DECLARE SECTION;
52  char buf[SERVERS_NAME_SIZE], *name;
53  int enable;
54  EXEC SQL END DECLARE SECTION;
55  struct save_queue *sq;
56  int status, srvcnt = 0;
57  char **arg = argv, *services[BUFSIZ];
58
59  if (strchr(argv[0], '/'))
60    strcpy(whoami, strrchr(argv[0], '/') + 1);
61  else strcpy(whoami, argv[0]);
62  umask(7);
63
64  setvbuf(stderr, NULL, _IOLBF, BUFSIZ);
65  setvbuf(stdout, NULL, _IOLBF, BUFSIZ);
66
67  initialize_sms_error_table();
68  initialize_krb_error_table();
69
70  while (++arg - argv < argc)
71    {
72      if (**arg == '-')
73        {
74          if (argis("f", "force"))
75            force++;
76          else
77            {
78              com_err(whoami, 0, "Usage: %s [-f] servicename", argv[0]);
79              exit(1);
80            }
81        }
82      else
83        /* Doesn't begin with a dash, is a service name.
84         * Build an array of them we can iterate through later.
85         */
86        {
87          services[srvcnt] = malloc(SERVERS_NAME_SIZE);
88          if (!services[srvcnt])
89            {
90              com_err(whoami, 0, "Out of memory!");
91              exit(1);
92            }
93          strncpy(services[srvcnt], *arg, SERVERS_NAME_SIZE);
94          srvcnt++;
95        }
96    }
97
98  /* Iterate through services specified on the command line, if any. */
99  if (srvcnt > 0)
100    {
101      for (i = 0; i < srvcnt; i++)
102        {
103          if (generate_service(services[i], force))
104            {
105              do_hosts(services[i]);
106              free(services[i]);
107            }
108        }
109      exit(0);
110    }
111
112  /* if DCM is not enabled, exit after logging */
113  if (!access(NODCMFILE, F_OK))
114    {
115      printf("/etc/nodcm exists -- exiting\n");
116      exit(1);
117    }
118
119  EXEC SQL WHENEVER SQLERROR DO dbmserr();
120
121  EXEC SQL CONNECT :db IDENTIFIED BY :db;
122
123  EXEC SQL SELECT value INTO :enable FROM numvalues WHERE name = 'dcm_enable';
124  if (enable == 0)
125    {
126      printf("dcm_enable not set -- exiting\n");
127      exit(1);
128    }
129
130  /* fetch list of services */
131  EXEC SQL DECLARE csr_svc CURSOR FOR SELECT LOWER(name) FROM servers
132    WHERE enable = 1 AND harderror = 0 AND update_int > 0;
133  EXEC SQL OPEN csr_svc;
134  sq = sq_create();
135  while (1)
136    {
137      EXEC SQL FETCH csr_svc INTO :buf;
138      if (sqlca.sqlcode)
139        break;
140
141      sq_save_data(sq, strdup(strtrim(buf)));
142    }
143  EXEC SQL CLOSE csr_svc;
144  /* we will repeatedly open and close the db since it seems to get
145     upset if you keep it open across a fork */
146  EXEC SQL COMMIT RELEASE;
147
148  /* Now run through list */
149  while (sq_get_data(sq, &name))
150    {
151      if (generate_service(name, force))
152        {
153          switch (fork())
154            {
155            case -1:
156              com_err(whoami, errno, "forking for service %s -- exiting",
157                      name);
158              exit(1);
159            case 0:
160              sprintf(strchr(whoami, '\0'), " (%s:%ld)", name, (long)getpid());
161              do_hosts(name);
162              com_err(whoami, 0, "exiting");
163              exit(0);
164            default:
165              break;
166            }
167        }
168    }
169
170  com_err(whoami, 0, "All files generated. Waiting for children to exit");
171  while (waitpid(0, &status, 0) > 0)
172    ;
173  com_err(whoami, 0, "exiting");
174  exit(0);
175}
176
177int generate_service(char *name, int force)
178{
179  EXEC SQL BEGIN DECLARE SECTION;
180  int interval, dfcheck, status, inprogress;
181  time_t now;
182  const char *errmsg;
183  EXEC SQL END DECLARE SECTION;
184  char dfgen_prog[MAXPATHLEN], dfgen_cmd[2 * MAXPATHLEN];
185  struct sigaction action, prevaction;
186  int waits;
187
188  EXEC SQL CONNECT :db IDENTIFIED BY :db;
189
190  EXEC SQL SELECT update_int, dfcheck, inprogress INTO :interval, :dfcheck,
191    :inprogress FROM servers WHERE name = UPPER(:name);
192  if (sqlca.sqlcode == SQL_NO_MATCH)
193    {
194      com_err(whoami, 0, "No such service `%s'", name);
195      EXEC SQL COMMIT RELEASE;
196      return 0;
197    }
198
199  /* Someone might try to run a DCM from the command line while the
200   * regular one is running, which will bypass the "interval" test.
201   * Check inprogress to make sure they don't stomp on themselves.
202   *
203   * Note that there is still a race condition here, and this doesn't
204   * absolutely prevent 2 DCMs from stepping on one another, but it
205   * does reduce the window of vulnerability greatly.
206   */
207  if (inprogress == 1)
208    {
209      com_err(whoami, 0, "DCM for service `%s' already in progress", name);
210      EXEC SQL COMMIT RELEASE;
211      return 0;
212    }
213
214  time(&now);
215
216  if ((interval * 60 + dfcheck < now) || force)
217    {
218      sprintf(dfgen_prog, "%s/%s.gen", BIN_DIR, name);
219      if (access(dfgen_prog, F_OK) != 0)
220        {
221          com_err(whoami, 0, "prog %s doesn't exist", dfgen_prog);
222          EXEC SQL COMMIT RELEASE;
223          return 0;
224        }
225      sprintf(dfgen_cmd, "exec %s %s/%s.out", dfgen_prog, DCM_DIR, name);
226      com_err(whoami, 0, "running %s", dfgen_prog);
227
228      EXEC SQL WHENEVER SQLERROR GOTO gen_cleanup;
229
230      EXEC SQL UPDATE servers SET inprogress = 1
231        WHERE name = UPPER(:name);
232      EXEC SQL COMMIT;
233
234      action.sa_flags = 0;
235      sigemptyset(&action.sa_mask);
236      action.sa_handler = SIG_DFL;
237      sigaction(SIGCHLD, &action, &prevaction);
238      waits = system(dfgen_cmd);
239      sigaction(SIGCHLD, &prevaction, NULL);
240      if (WIFSIGNALED(waits))
241        {
242          status = MR_COREDUMP;
243          com_err(whoami, status, " %s exited on signal %d",
244                  dfgen_prog, WTERMSIG(waits));
245        }
246      else if (WEXITSTATUS(waits))
247        {
248          /* extract the process's exit value */
249          status = WEXITSTATUS(waits) + ERROR_TABLE_BASE_sms;
250          if (status != MR_NO_CHANGE)
251            com_err(whoami, status, "in %s", dfgen_prog);
252        }
253      else
254        status = MR_SUCCESS;
255
256      if (status == MR_SUCCESS)
257        {
258          EXEC SQL UPDATE servers SET dfgen = :now, dfcheck = :now,
259            inprogress = 0 WHERE name = UPPER(:name);
260          EXEC SQL COMMIT RELEASE;
261          return 1;
262        }
263      else if (status == MR_NO_CHANGE)
264        {
265          EXEC SQL UPDATE servers SET dfcheck = :now, inprogress = 0
266            WHERE name = UPPER(:name);
267        }
268      else if (SOFT_FAIL(status))
269        {
270          errmsg = error_message(status);
271          EXEC SQL UPDATE servers SET errmsg = :errmsg, inprogress = 0
272            WHERE name = UPPER(:name);
273        }
274      else /* HARD_FAIL(status) */
275        {
276          errmsg = error_message(status);
277          EXEC SQL UPDATE servers SET harderror = :status, errmsg = :errmsg,
278            inprogress = 0 WHERE name = UPPER(:name);
279          critical_alert(whoami, "DCM", "DCM building config files for %s: %s",
280                         name, errmsg);
281        }
282    }
283  else
284    {
285      com_err(whoami, 0, "DCM for service `%s' has run too recently.", name);
286      com_err(whoami, 0, "Use the -force flag to force a DCM.");
287    }
288
289  EXEC SQL COMMIT RELEASE;
290  return 0;
291
292gen_cleanup:
293  EXEC SQL WHENEVER SQLERROR DO dbmserr();
294  EXEC SQL UPDATE servers SET inprogress = 0, harderror = MR_INTERNAL,
295    errmsg = 'DBMS Internal Error' WHERE name = UPPER(:name);
296  dbmserr();
297}
298
299void do_hosts(char *service)
300{
301  EXEC SQL BEGIN DECLARE SECTION;
302  char server_type[SERVERS_TYPE_SIZE], host[MACHINE_NAME_SIZE], *name;
303  char target[SERVERS_TARGET_FILE_SIZE], script[SERVERS_SCRIPT_SIZE];
304  const char *errmsg;
305  int status = 0, dfgen, type, mid, inprogress;
306  time_t now;
307  EXEC SQL END DECLARE SECTION;
308  struct save_queue *sq;
309
310  time(&now);
311  mr_init();
312
313  EXEC SQL CONNECT :db IDENTIFIED BY :db;
314
315  EXEC SQL SELECT dfgen, type, target_file, script, inprogress
316    INTO :dfgen, :server_type, :target, :script, :inprogress
317    FROM servers WHERE name = UPPER(:service);
318  if (!strncmp(strtrim(server_type), "REPLICAT", 8))
319    type = REPLICATED;
320  else if (!strncmp(server_type, "DISTRIB", 8))
321    type = DISTRIBUTED;
322  else
323    type = UNIQUE;
324  strtrim(target);
325  strtrim(script);
326
327  /* Rudimentary locking.  Doesn't eliminate the possibility of 2 DCMs
328   * stepping on one another, but makes it harder.
329   */
330  if (inprogress == 1)
331    {
332      com_err(whoami, 0, "DCM for service `%s' already in progress", name);
333      EXEC SQL COMMIT RELEASE;
334      return;
335    }
336
337  EXEC SQL DECLARE csr_hst1 CURSOR FOR
338    SELECT m.name, m.mach_id FROM machine m, serverhosts sh
339    WHERE sh.service = UPPER(:service)
340    AND sh.enable = 1 AND sh.hosterror = 0
341    AND sh.lts < :dfgen AND sh.mach_id = m.mach_id;
342  EXEC SQL OPEN csr_hst1;
343  sq = sq_create();
344  while (1)
345    {
346      EXEC SQL FETCH csr_hst1 INTO :host, mid;
347      if (sqlca.sqlcode == SQL_NO_MATCH)
348        break;
349
350      sq_save_data(sq, strdup(strtrim(host)));
351      sq_save_data(sq, (void *)(long)mid);
352    }
353  EXEC SQL CLOSE csr_hst1;
354
355  EXEC SQL WHENEVER SQLERROR GOTO host_cleanup;
356  while (sq_get_data(sq, &name))
357    {
358      sq_get_data(sq, &mid);
359
360      EXEC SQL SELECT inprogress INTO :inprogress FROM serverhosts
361        WHERE service = UPPER(:service) AND mach_id = :mid;
362      /* Check if someone got here before we did.
363       * There's still a race condition here, but it's a small one. */
364      if (inprogress == 1)
365        {
366          com_err(whoami, 0, "DCM for service `%s' to host `%s' already in progress", service, name);
367          EXEC SQL COMMIT RELEASE;
368          return;
369        }
370
371      com_err(whoami, 0, "sending %s data to %s", service, name);
372      EXEC SQL UPDATE serverhosts SET inprogress = 1
373        WHERE service = UPPER(:service) AND mach_id = :mid;
374      EXEC SQL COMMIT;
375      status = dcm_send_file(service, type, name, target);
376      if (status)
377        {
378          errmsg = error_message(status);
379          EXEC SQL UPDATE serverhosts SET hosterrmsg = :errmsg,
380            inprogress = 0, success = 0, ltt = :now
381            WHERE service = UPPER(:service) AND mach_id = :mid;
382          if (!SOFT_FAIL(status))
383            {
384              EXEC SQL UPDATE serverhosts SET hosterror = :status
385                WHERE service = UPPER(:service) AND mach_id = :mid;
386              critical_alert(whoami, "DCM", "DCM updating %s:%s: %s",
387                             service, name, errmsg);
388            }
389          EXEC SQL COMMIT;
390
391          if (type == REPLICATED)
392            break;
393        }
394    }
395  sq_destroy(sq);
396
397  if (status == MR_SUCCESS || type != REPLICATED)
398    {
399      EXEC SQL DECLARE csr_hst2 CURSOR FOR
400        SELECT m.name, m.mach_id FROM machine m, serverhosts sh
401        WHERE sh.service = UPPER(:service) AND sh.inprogress = 1
402        AND sh.enable = 1 AND sh.hosterror = 0 AND sh.mach_id = m.mach_id;
403      EXEC SQL OPEN csr_hst2;
404      sq = sq_create();
405
406      while (1)
407        {
408          EXEC SQL FETCH csr_hst2 INTO :host, :mid;
409          if (sqlca.sqlcode == SQL_NO_MATCH)
410            break;
411
412          sq_save_data(sq, strdup(strtrim(host)));
413          sq_save_data(sq, (void *)(long)mid);
414        }
415      EXEC SQL CLOSE csr_hst2;
416
417      while (sq_get_data(sq, &name))
418        {
419          sq_get_data(sq, &mid);
420
421          com_err(whoami, 0, "executing instructions on %s", name);
422          status = dcm_execute(service, name, script);
423          if (status)
424            {
425              errmsg = error_message(status);
426              EXEC SQL UPDATE serverhosts SET hosterrmsg = :errmsg,
427                inprogress = 0, success = 0, ltt = :now
428                WHERE service = UPPER(:service) AND mach_id = :mid;
429              if (!SOFT_FAIL(status))
430                {
431                  EXEC SQL UPDATE serverhosts SET hosterror = :status
432                    WHERE service = UPPER(:service) AND mach_id = :mid;
433                  critical_alert(whoami, "DCM", "DCM updating %s:%s: %s",
434                                 service, name, errmsg);
435                }
436
437              if (type == REPLICATED)
438                break;
439            }
440          else
441            {
442              EXEC SQL UPDATE serverhosts SET inprogress = 0, ltt = :now,
443                lts = :now, success = 1 WHERE service = UPPER(:service)
444                AND mach_id = :mid;
445            }
446          EXEC SQL COMMIT;
447        }
448      EXEC SQL CLOSE csr_hst2;
449    }
450
451  if (type == REPLICATED)
452    {
453      /* Clear inprogress flag on any hosts we started but didn't
454       * finish.
455       */
456      EXEC SQL UPDATE serverhosts SET inprogress = 0
457        WHERE service = UPPER(:service);
458    }
459
460  EXEC SQL WHENEVER SQLERROR DO dbmserr();
461  if (status && !SOFT_FAIL(status) && type == REPLICATED)
462    {
463      EXEC SQL UPDATE servers SET harderror = :status, errmsg = :errmsg
464        WHERE name = UPPER(:service);
465    }
466
467  EXEC SQL COMMIT RELEASE;
468  return;
469
470host_cleanup:
471  EXEC SQL UPDATE serverhosts SET inprogress = 0, success = 0, ltt = :now,
472    hosterror = MR_INTERNAL, hosterrmsg = 'DBMS Internal Error'
473    WHERE service = UPPER(:service) AND mach_id = :mid;
474  if (type == REPLICATED)
475    {
476      EXEC SQL UPDATE servers SET harderror = MR_INTERNAL,
477        errmsg = 'DBMS Internal Error' WHERE name = UPPER(:service);
478    }
479}
480
481int dcm_send_file(char *service, int type, char *host, char *target)
482{
483  char data[MAXPATHLEN];
484  int code, conn;
485
486  conn = mr_connect_internal(host, "moira_update");
487  if (!conn)
488    {
489      com_err(whoami, errno, "can't connect to %s", host);
490      return MR_CANT_CONNECT;
491    }
492
493  code = mr_send_krb5_auth(conn, host);
494  if (code)
495    code = mr_send_auth(conn, host);
496  if (code)
497    {
498      com_err(whoami, code, "authenticating to %s", host);
499      goto done;
500    }
501
502  if (type == DISTRIBUTED)
503    sprintf(data, "%s/%s/%s", DCM_DIR, service, host);
504  else
505    sprintf(data, "%s/%s.out", DCM_DIR, service);
506  code = mr_send_file(conn, data, target, 0);
507  if (code)
508    com_err(whoami, code, "sending data to %s", host);
509
510done:
511  mr_send_quit(conn);
512  close(conn);
513  return code;
514}
515
516int dcm_execute(char *service, char *host, char *script)
517{
518  char inst[MAXPATHLEN];
519  int code, conn;
520
521  conn = mr_connect_internal(host, "moira_update");
522  if (!conn)
523    {
524      com_err(whoami, errno, "can't connect to %s", host);
525      return MR_CANT_CONNECT;
526    }
527
528  code = mr_send_krb5_auth(conn, host);
529  if (code)
530    code = mr_send_auth(conn, host);
531  if (code)
532    {
533      com_err(whoami, code, "authenticating to %s", host);
534      goto done;
535    }
536
537  sprintf(inst, "/tmp/moira-update.XXXXXX");
538  mkstemp(inst);
539  code = mr_send_file(conn, script, inst, 0);
540  if (code)
541    {
542      com_err(whoami, code, "sending instructions to %s", host);
543      goto done;
544    }
545
546  code = mr_execute(conn, inst);
547  if (code)
548    com_err(whoami, code, "executing instructions on %s", host);
549
550done:
551  mr_send_quit(conn);
552  close(conn);
553  return code;
554}
555
556void dbmserr(void)
557{
558  EXEC SQL BEGIN DECLARE SECTION;
559  char err_msg[256];
560  EXEC SQL END DECLARE SECTION;
561  int bufsize = 256, msglength = 0;
562
563  sqlglm(err_msg, &bufsize, &msglength);
564  err_msg[msglength] = '\0';
565  com_err(whoami, 0, "Encountered SQL error:\n%s", err_msg);
566  com_err(whoami, 0, "exiting");
567  exit(1);
568}
Note: See TracBrowser for help on using the repository browser.