Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
home:llunak:my
icecream
l-flto-all-slots.patch
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File l-flto-all-slots.patch of Package icecream
From e7941ae0b016862100d973c66c053f1bc61c27df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= <l.lunak@centrum.cz> Date: Sat, 7 May 2022 17:57:03 +0200 Subject: [PATCH 1/3] make parallel -flto linking jobs reserve all local slots If linking uses parallel LTO, then it doesn't make sense to run more than one at the same time (they'd use more memory, sometimes causing OOM, and they'd compete for the CPU). So make such "full" local jobs reserve all local slots while running. --- client/arg.cpp | 22 ++++++++++++++++++++-- client/client.h | 11 +++++++++-- client/main.cpp | 9 +++++++-- daemon/main.cpp | 37 ++++++++++++++++++++++++++++--------- scheduler/scheduler.cpp | 3 ++- services/comm.cpp | 8 ++++++++ services/comm.h | 9 ++++++--- tests/Makefile.flto | 13 +++++++++++++ tests/flto-g++ | 15 +++++++++++++++ tests/test.sh | 39 ++++++++++++++++++++++++++++++++++++++- 10 files changed, 146 insertions(+), 20 deletions(-) create mode 100644 tests/Makefile.flto create mode 100755 tests/flto-g++ diff --git a/client/arg.cpp b/client/arg.cpp index 3a26dc4..260209d 100644 --- a/client/arg.cpp +++ b/client/arg.cpp @@ -286,7 +286,7 @@ static bool analyze_assembler_arg(string &arg, list<string> *extrafiles) } -bool analyse_argv(const char * const *argv, CompileJob &job, bool icerun, list<string> *extrafiles) +int analyse_argv(const char * const *argv, CompileJob &job, bool icerun, list<string> *extrafiles) { ArgumentsList args; string ofile; @@ -315,6 +315,7 @@ bool analyse_argv(const char * const *argv, CompileJob &job, bool icerun, list<s bool seen_march_native = false; bool seen_mcpu_native = false; bool seen_mtune_native = false; + std::string seen_parallel_flto; const char *standard = nullptr; // if rewriting includes and precompiling on remote machine, then cpp args are not local Argument_Type Arg_Cpp = compiler_only_rewrite_includes(job) ? 
Arg_Rest : Arg_Local; @@ -709,6 +710,15 @@ bool analyse_argv(const char * const *argv, CompileJob &job, bool icerun, list<s if (argv[i + 1]) { args.append(argv[++i], Arg_Rest); } + } else if (str_startswith("-flto", a)) { + args.append(a, Arg_Rest); + // If -flto will be parallel (and thus use all cores), make it a "full" job + // so that it reserves the entire local node. The only non-parallel -flto + // options appear to be GCC's -flto without arguments or -flto=1. + if( compiler_is_clang(job)) + seen_parallel_flto = a; + else if( !str_equal("-flto", a) && !str_equal("-flto=1", a)) + seen_parallel_flto = a; } else { args.append(a, Arg_Rest); @@ -946,5 +956,13 @@ bool analyse_argv(const char * const *argv, CompileJob &job, bool icerun, list<s << endl; #endif - return always_local; + int ret = 0; + if( always_local ) { + ret |= AlwaysLocal; + if( !seen_parallel_flto.empty() && !seen_c ) { + ret |= FullJob; + trace() << seen_parallel_flto << " and no -c, building with all local slots" << endl; + } + } + return ret; } diff --git a/client/client.h b/client/client.h index cbaf73d..a3b05a8 100644 --- a/client/client.h +++ b/client/client.h @@ -43,9 +43,16 @@ extern std::string remote_daemon; /* in remote.cpp */ extern std::string get_absfilename(const std::string &_file); +enum RunFlags +{ + None = 0, + AlwaysLocal = 1 << 0, // The job should be built locally. + FullJob = 1 << 1 // The job should reserve all slots (if AlwaysLocal). +}; /* In arg.cpp. */ -extern bool analyse_argv(const char * const *argv, CompileJob &job, bool icerun, - std::list<std::string> *extrafiles); +// Returns RunFlags or-ed. +extern int analyse_argv(const char * const *argv, CompileJob &job, bool icerun, + std::list<std::string> *extrafiles); /* In cpp.cpp. 
*/ extern pid_t call_cpp(CompileJob &job, int fdwrite, int fdread = -1); diff --git a/client/main.cpp b/client/main.cpp index a08828d..fdf4ac3 100644 --- a/client/main.cpp +++ b/client/main.cpp @@ -403,7 +403,12 @@ int main(int argc, char **argv) } list<string> extrafiles; - local |= analyse_argv(argv, job, icerun, &extrafiles); + bool fulljob = false; + int argv_result = analyse_argv(argv, job, icerun, &extrafiles); + if( argv_result & AlwaysLocal ) { + local = true; + fulljob = argv_result & FullJob; + } /* If ICECC is set to disable, then run job locally, without contacting the daemon at all. File-based locking will still ensure that all @@ -586,7 +591,7 @@ int main(int argc, char **argv) Msg *startme = nullptr; /* Inform the daemon that we like to start a job. */ - if (local_daemon->send_msg(JobLocalBeginMsg(0, get_absfilename(job.outputFile())))) { + if (local_daemon->send_msg(JobLocalBeginMsg(0, get_absfilename(job.outputFile()),fulljob))) { /* Now wait until the daemon gives us the start signal. 40 minutes should be enough for all normal compile or link jobs. */ startme = local_daemon->get_msg(40 * 60); diff --git a/daemon/main.cpp b/daemon/main.cpp index a893b31..75d540f 100644 --- a/daemon/main.cpp +++ b/daemon/main.cpp @@ -141,6 +141,7 @@ public: pipe_from_child = -1; pipe_to_child = -1; child_pid = -1; + fulljob = false; } static string status_str(Status status) { @@ -209,6 +210,7 @@ public: // pipe to child process, only valid if TOINSTALL/WAITINSTALL int pipe_to_child; pid_t child_pid; + bool fulljob; // during LINKJOB and CLIENTWORK, reserve all slots if set string pending_create_env; // only for WAITCREATEENV string dump() const { @@ -216,7 +218,8 @@ public: switch (status) { case LINKJOB: - return ret + " ClientID: " + toString(client_id) + " " + outfile + " PID: " + toString(child_pid); + return ret + " ClientID: " + toString(client_id) + " " + outfile + (fulljob ? 
" (full)" : "") + + " PID: " + toString(child_pid); case TOINSTALL: case WAITINSTALL: return ret + " ClientID: " + toString(client_id) + " " + outfile + " PID: " + toString(child_pid); @@ -1510,12 +1513,17 @@ bool Daemon::create_env_finished(string env_key) bool Daemon::handle_job_done(Client *cl, JobDoneMsg *m) { if (cl->status == Client::CLIENTWORK) { - clients.active_processes--; + if(cl->fulljob) + clients.active_processes -= std::max((unsigned int)1, max_kids); + else + clients.active_processes--; } cl->status = Client::JOBDONE; JobDoneMsg *msg = static_cast<JobDoneMsg *>(m); - trace() << "handle_job_done " << msg->job_id << " " << msg->exitcode << endl; + trace() << "handle_job_done " << msg->job_id << " " << (cl->fulljob ? "(full) " : "") + << msg->exitcode << endl; + cl->fulljob = false; if (!m->is_from_server() && (m->user_msec + m->sys_msec) <= m->real_msec) { @@ -1544,10 +1552,15 @@ void Daemon::handle_old_request() handle_end(client, 112); } else { client->status = Client::CLIENTWORK; - clients.active_processes++; - trace() << "pushed local job " << client->client_id << endl; - - if (!send_scheduler(JobLocalBeginMsg(client->client_id, client->outfile))) { + if(client->fulljob) { // reserve the entire node + clients.active_processes += std::max((unsigned int)1, max_kids); + trace() << "pushed full local job " << client->client_id << endl; + } else { + clients.active_processes++; + trace() << "pushed local job " << client->client_id << endl; + } + if (!send_scheduler(JobLocalBeginMsg(client->client_id, client->outfile, + client->fulljob))) { return; } } @@ -1723,8 +1736,12 @@ void Daemon::handle_end(Client *client, int exitcode) } if (client->status == Client::CLIENTWORK) { - clients.active_processes--; + if(client->fulljob) + clients.active_processes -= std::max((unsigned int)1, max_kids); + else + clients.active_processes--; } + client->fulljob = false; if (client->status == Client::WAITCOMPILE && exitcode == 119) { /* the client sent us a real good 
bye, so forget about the scheduler */ @@ -1861,8 +1878,10 @@ int Daemon::handle_cs_conf(ConfCSMsg *msg) bool Daemon::handle_local_job(Client *client, Msg *msg) { + JobLocalBeginMsg* m = dynamic_cast<JobLocalBeginMsg *>(msg); client->status = Client::LINKJOB; - client->outfile = dynamic_cast<JobLocalBeginMsg *>(msg)->outfile; + client->outfile = m->outfile; + client->fulljob = m->fulljob; return true; } diff --git a/scheduler/scheduler.cpp b/scheduler/scheduler.cpp index b1ea334..a5f1a24 100644 --- a/scheduler/scheduler.cpp +++ b/scheduler/scheduler.cpp @@ -567,7 +567,8 @@ static bool handle_local_job(CompileServer *cs, Msg *_m) } ++new_job_id; - trace() << "handle_local_job " << m->outfile << " " << m->id << endl; + trace() << "handle_local_job " << (m->fulljob ? "(full) " : "") << m->outfile + << " " << m->id << endl; cs->insertClientJobId(m->id, new_job_id); notify_monitors(new MonLocalJobBeginMsg(new_job_id, m->outfile, m->stime, cs->hostId())); return true; diff --git a/services/comm.cpp b/services/comm.cpp index ad53393..5332f7d 100644 --- a/services/comm.cpp +++ b/services/comm.cpp @@ -2281,6 +2281,11 @@ void JobLocalBeginMsg::fill_from_channel(MsgChannel *c) *c >> stime; *c >> outfile; *c >> id; + if (IS_PROTOCOL_44(c)) { + uint32_t full; + *c >> full; + fulljob = full; + } } void JobLocalBeginMsg::send_to_channel(MsgChannel *c) const @@ -2289,6 +2294,9 @@ void JobLocalBeginMsg::send_to_channel(MsgChannel *c) const *c << stime; *c << outfile; *c << id; + if (IS_PROTOCOL_44(c)) { + *c << (uint32_t) fulljob; + } } void JobLocalDoneMsg::fill_from_channel(MsgChannel *c) diff --git a/services/comm.h b/services/comm.h index 48258b0..3d5fa08 100644 --- a/services/comm.h +++ b/services/comm.h @@ -36,7 +36,7 @@ #include "job.h" // if you increase the PROTOCOL_VERSION, add a macro below and use that -#define PROTOCOL_VERSION 43 +#define PROTOCOL_VERSION 44 // if you increase the MIN_PROTOCOL_VERSION, comment out macros below and clean up the code #define 
MIN_PROTOCOL_VERSION 21 @@ -68,6 +68,7 @@ #define IS_PROTOCOL_41(c) ((c)->protocol >= 41) #define IS_PROTOCOL_42(c) ((c)->protocol >= 42) #define IS_PROTOCOL_43(c) ((c)->protocol >= 43) +#define IS_PROTOCOL_44(c) ((c)->protocol >= 44) // Terms used: // S = scheduler @@ -685,11 +686,12 @@ public: class JobLocalBeginMsg : public Msg { public: - JobLocalBeginMsg(int job_id = 0, const std::string &file = "") + JobLocalBeginMsg(int job_id = 0, const std::string &file = "", bool full = false) : Msg(M_JOB_LOCAL_BEGIN) , outfile(file) , stime(time(0)) - , id(job_id) {} + , id(job_id) + , fulljob(full) {} virtual void fill_from_channel(MsgChannel *c); virtual void send_to_channel(MsgChannel *c) const; @@ -697,6 +699,7 @@ public: std::string outfile; uint32_t stime; uint32_t id; + bool fulljob; }; class JobLocalDoneMsg : public Msg diff --git a/tests/Makefile.flto b/tests/Makefile.flto new file mode 100644 index 0000000..634df2c --- /dev/null +++ b/tests/Makefile.flto @@ -0,0 +1,13 @@ +all: flto1 flto2 + +# $(CXX) will be flto-g++ dummy, the only thing that matters is that there's no -c +# and that there is -flto=auto +flto1: prepare_lock + $(CXX) a1.cpp -flto=auto + +flto2: prepare_lock + $(CXX) a2.cpp -flto=auto + +# make sure there's no lockfile at the start +prepare_lock: + rm -f flto_lockfile diff --git a/tests/flto-g++ b/tests/flto-g++ new file mode 100755 index 0000000..bab881e --- /dev/null +++ b/tests/flto-g++ @@ -0,0 +1,15 @@ +#! 
/bin/sh + +# the lockfile has been removed by the Makefile, so if it exists, +# it means another job is running in parallel -> fail +if test -f ./flto_lockfile; then + echo flto-g++ found a lock file >&2 + exit 1 +fi + +# create a lockfile while running to detect two jobs running in parallel +touch ./flto_lockfile +sleep 2 +rm -f ./flto_lockfile + +exit 0 diff --git a/tests/test.sh b/tests/test.sh index 62203f3..323e62b 100755 --- a/tests/test.sh +++ b/tests/test.sh @@ -837,6 +837,31 @@ make_test() make -f Makefile.test OUTDIR="$testdir" clean -s } +serialized_flto_test() +{ + # check that running two link jobs with -flto=auto are not run at the same time + echo Running serialize flto test. + reset_logs "" "serialize flto test" + # use a dummy "compiler" that will wait for a while and then exits + ICECC_TEST_SOCKET="$testdir"/socket-localice ICECC_TEST_REMOTEBUILD=1 ICECC_DEBUG=debug ICECC_LOGFILE="$testdir"/icecc.log \ + make -f Makefile.flto OUTDIR="$testdir" CXX="${icecc} ./flto-g++" -j2 -s 2>>"$testdir"/stderr.log + if test $? -ne 0; then + echo Serialize flto test failed. + stop_ice 0 + abort_tests + fi + flush_logs + check_logs_for_generic_errors + check_everything_is_idle + + check_log_message_count icecc 2 "-flto=auto and no -c, building with all local slots" + check_log_message_count localice 2 "pushed full local job" + check_log_message_count scheduler 2 "handle_local_job (full)" + check_log_message_count scheduler 2 "handle_local_job_done" + echo Serialize flto test successful. 
+ echo +} + # 1st argument, if set, means we run without scheduler icerun_serialize_test() { @@ -1881,7 +1906,7 @@ check_log_message_count() { log="$1" expected_count="$2" - count=$(cat_log_last_mark ${log} | grep "$3" | wc -l) + count=$(cat_log_last_mark ${log} | grep -- "$3" | wc -l) if test $count -ne $expected_count; then echo "Error, $log log does not contain expected count (${count} vs ${expected_count}): $3" stop_ice 0 @@ -1981,6 +2006,12 @@ echo run_ice "$testdir/plain.o" "remote" 0 $TESTCXX -Wall -Werror -c plain.cpp -o "$testdir/"plain.o +if test -z "$chroot_disabled"; then + serialized_flto_test +else + skipped_tests="$skipped_tests serialized_flto_test" +fi + run_ice "$testdir/plain.o" "remote" 0 $TESTCC -Wall -Werror -c plain.c -o "$testdir/"plain.o run_ice "$testdir/plain.o" "remote" 0 $TESTCXX -Wall -Werror -c plain.cpp -O2 -o "$testdir/"plain.o run_ice "$testdir/plain.ii" "local" 0 $TESTCXX -Wall -Werror -E plain.cpp -o "$testdir/"plain.ii @@ -2246,6 +2277,12 @@ else skipped_tests="$skipped_tests make_test" fi +if test -z "$chroot_disabled"; then + serialized_flto_test +else + skipped_tests="$skipped_tests serialized_flto_test" +fi + if test -z "$chroot_disabled"; then zero_local_jobs_test else -- 2.35.3 From 891b3f3b291f9455465ab7b2ba2016b62c064fda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= <l.lunak@centrum.cz> Date: Sat, 7 May 2022 19:47:32 +0200 Subject: [PATCH 2/3] count also local jobs when selecting where to schedule a job Up until now, if a node was fully busy e.g. linking, the scheduler still considered it to have no jobs running and might have decided to schedule jobs to it. 
--- scheduler/compileserver.cpp | 30 ++++++++++++++++++++---------- scheduler/compileserver.h | 18 ++++++++++++++---- scheduler/scheduler.cpp | 24 ++++++++++++------------ 3 files changed, 46 insertions(+), 26 deletions(-) diff --git a/scheduler/compileserver.cpp b/scheduler/compileserver.cpp index 314544a..1353b76 100644 --- a/scheduler/compileserver.cpp +++ b/scheduler/compileserver.cpp @@ -61,7 +61,7 @@ CompileServer::CompileServer(const int fd, struct sockaddr *_addr, const socklen , m_lastRequestedJobs() , m_cumCompiled() , m_cumRequested() - , m_clientMap() + , m_clientLocalMap() , m_blacklist() , m_inFd(-1) , m_inConnAttempt(0) @@ -194,15 +194,17 @@ bool CompileServer::is_eligible_now(const Job *job) const { if(!is_eligible_ever(job)) return false; - bool jobs_okay = int(m_jobList.size()) < m_maxJobs; - if( m_maxJobs > 0 && int(m_jobList.size()) < m_maxJobs + maxPreloadCount()) + int jobs_now = currentJobCount(); + bool jobs_okay = jobs_now < m_maxJobs; + if( m_maxJobs > 0 && jobs_now < m_maxJobs + maxPreloadCount()) jobs_okay = true; // allow a job for preloading bool load_okay = m_load < 1000; bool eligible = jobs_okay && load_okay && can_install(job, false).size(); #if DEBUG_SCHEDULER > 2 - trace() << nodeName() << " is_eligible_now: " << eligible << " (jobs_okay " << jobs_okay + trace() << nodeName() << " is_eligible_now: " << eligible << " (remote jobs " << m_jobList.size() + << ", local jobs " << (currentJobCount() - m_jobList.size()) << ", jobs_okay " << jobs_okay << ", load_okay " << load_okay << ")" << endl; #endif return eligible; @@ -283,6 +285,14 @@ void CompileServer::setMaxJobs(int jobs) m_maxJobs = jobs; } +int CompileServer::currentJobCount() const +{ + int count = m_jobList.size(); + for( const std::pair<int, LocalJobInfo>& info : m_clientLocalMap ) + count += info.second.fulljob ? 
m_maxJobs : 1; + return count; +} + bool CompileServer::noRemote() const { return m_noRemote; @@ -444,19 +454,19 @@ void CompileServer::setCumRequested(const JobStat &stats) m_cumRequested = stats; } -int CompileServer::getClientJobId(const int localJobId) +int CompileServer::getClientLocalJobId(const int localJobId) { - return m_clientMap[localJobId]; + return m_clientLocalMap[localJobId].id; } -void CompileServer::insertClientJobId(const int localJobId, const int newJobId) +void CompileServer::insertClientLocalJobId(const int localJobId, const int newJobId, bool fulljob) { - m_clientMap[localJobId] = newJobId; + m_clientLocalMap[localJobId] = LocalJobInfo{newJobId, fulljob}; } -void CompileServer::eraseClientJobId(const int localJobId) +void CompileServer::eraseClientLocalJobId(const int localJobId) { - m_clientMap.erase(localJobId); + m_clientLocalMap.erase(localJobId); } map<const CompileServer *, Environments> CompileServer::blacklist() const diff --git a/scheduler/compileserver.h b/scheduler/compileserver.h index 69205d5..331de4f 100644 --- a/scheduler/compileserver.h +++ b/scheduler/compileserver.h @@ -84,6 +84,8 @@ public: int maxJobs() const; void setMaxJobs(const int jobs); int maxPreloadCount() const; + // Counts both remote and local jobs. 
+ int currentJobCount() const; bool noRemote() const; void setNoRemote(const bool value); @@ -132,9 +134,9 @@ public: unsigned int hostidCounter() const; - int getClientJobId(const int localJobId); - void insertClientJobId(const int localJobId, const int newJobId); - void eraseClientJobId(const int localJobId); + int getClientLocalJobId(const int localJobId); + void insertClientLocalJobId(const int localJobId, const int newJobId, bool fulljob); + void eraseClientLocalJobId(const int localJobId); map<const CompileServer *, Environments> blacklist() const; Environments getEnvsForBlacklistedCS(const CompileServer *cs); @@ -180,7 +182,15 @@ private: JobStat m_cumRequested; static unsigned int s_hostIdCounter; - map<int, int> m_clientMap; // map client ID for daemon to our IDs + + // map client ID for daemon to our IDs + struct LocalJobInfo + { + int id; + bool fulljob; + }; + map<int, LocalJobInfo> m_clientLocalMap; + map<const CompileServer *, Environments> m_blacklist; int m_inFd; diff --git a/scheduler/scheduler.cpp b/scheduler/scheduler.cpp index a5f1a24..4521223 100644 --- a/scheduler/scheduler.cpp +++ b/scheduler/scheduler.cpp @@ -315,7 +315,7 @@ static float server_speed(CompileServer *cs, Job *job, bool blockDebug) * takes care of the fact that not all slots are equally fast on * CPUs with SMT and dynamic clock ramping. */ - f *= (1.0f - (0.5f * cs->jobList().size() / cs->maxJobs())); + f *= (1.0f - (0.5f * cs->currentJobCount() / cs->maxJobs())); } // below we add a pessimism factor - assuming the first job a computer got is not representative @@ -569,7 +569,7 @@ static bool handle_local_job(CompileServer *cs, Msg *_m) ++new_job_id; trace() << "handle_local_job " << (m->fulljob ? 
"(full) " : "") << m->outfile << " " << m->id << endl; - cs->insertClientJobId(m->id, new_job_id); + cs->insertClientLocalJobId(m->id, new_job_id, m->fulljob); notify_monitors(new MonLocalJobBeginMsg(new_job_id, m->outfile, m->stime, cs->hostId())); return true; } @@ -583,8 +583,8 @@ static bool handle_local_job_done(CompileServer *cs, Msg *_m) } trace() << "handle_local_job_done " << m->job_id << endl; - notify_monitors(new JobLocalDoneMsg(cs->getClientJobId(m->job_id))); - cs->eraseClientJobId(m->job_id); + notify_monitors(new JobLocalDoneMsg(cs->getClientLocalJobId(m->job_id))); + cs->eraseClientLocalJobId(m->job_id); return true; } @@ -706,8 +706,8 @@ static CompileServer *pick_server(Job *job) // Ignore ineligible servers if (!cs->is_eligible_now(job)) { #if DEBUG_SCHEDULER > 1 - if ((int(cs->jobList().size()) >= cs->maxJobs() + cs->maxPreloadCount()) || (cs->load() >= 1000)) { - trace() << "overloaded " << cs->nodeName() << " " << cs->jobList().size() << "/" + if ((cs->currentJobCount() >= cs->maxJobs() + cs->maxPreloadCount()) || (cs->load() >= 1000)) { + trace() << "overloaded " << cs->nodeName() << " " << cs->currentJobCount() << "/" << cs->maxJobs() << " jobs, load:" << cs->load() << endl; } else trace() << cs->nodeName() << " not eligible" << endl; @@ -738,12 +738,12 @@ static CompileServer *pick_server(Job *job) #if DEBUG_SCHEDULER > 1 trace() << cs->nodeName() << " compiled " << cs->lastCompiledJobs().size() << " got now: " << - cs->jobList().size() << " speed: " << server_speed(cs, job, true) << " compile time " << + cs->currentJobCount() << " speed: " << server_speed(cs, job, true) << " compile time " << cs->cumCompiled().compileTimeUser() << " produced code " << cs->cumCompiled().outputSize() << " client count: " << cs->clientCount() << endl; #endif - if ((cs->lastCompiledJobs().size() == 0) && (cs->jobList().size() == 0) && cs->maxJobs()) { + if ((cs->lastCompiledJobs().size() == 0) && (cs->currentJobCount() == 0) && cs->maxJobs()) { /* Make all 
servers compile a job at least once, so we'll get an idea about their speed. */ if (!envs_match(cs, job).empty()) { @@ -775,7 +775,7 @@ static CompileServer *pick_server(Job *job) the job. (XXX currently this is equivalent to the fastest one) */ else if ((best->lastCompiledJobs().size() != 0) && (server_speed(best, job) < server_speed(cs, job))) { - if (int(cs->jobList().size()) < cs->maxJobs()) { + if (cs->currentJobCount() < cs->maxJobs()) { best = cs; } else { bestpre = cs; @@ -791,7 +791,7 @@ static CompileServer *pick_server(Job *job) the job. (XXX currently this is equivalent to the fastest one) */ else if ((bestui->lastCompiledJobs().size() != 0) && (server_speed(bestui, job) < server_speed(cs, job))) { - if (int(cs->jobList().size()) < cs->maxJobs()) { + if (cs->currentJobCount() < cs->maxJobs()) { bestui = cs; } else { bestpre = cs; @@ -931,7 +931,7 @@ static bool empty_queue() /* Ignore the load on the submitter itself if no other host could be found. We only obey to its max job number. */ use_cs = job->submitter(); - if ((int(use_cs->jobList().size()) < use_cs->maxJobs()) + if ((use_cs->currentJobCount() < use_cs->maxJobs()) && job->preferredHost().empty() /* This should be trivially true. 
*/ && use_cs->can_install(job).size()) { @@ -1487,7 +1487,7 @@ static bool handle_line(CompileServer *cs, Msg *_m) line = " " + it->nodeName() + buffer; line += "[" + it->hostPlatform() + "] speed="; sprintf(buffer, "%.2f jobs=%d/%d load=%u", server_speed(it), - (int)it->jobList().size(), it->maxJobs(), it->load()); + it->currentJobCount(), it->maxJobs(), it->load()); line += buffer; if (it->busyInstalling()) { -- 2.35.3 From 57fd7f1441f8c484af3e229fb2e4ca2646487427 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= <l.lunak@centrum.cz> Date: Sat, 7 May 2022 19:56:11 +0200 Subject: [PATCH 3/3] don't allow job preloading if fully busy with local jobs If a node is fully busy with local jobs, that may mean it's linking, which may take some time (especially if using LTO), or it may also be icerun that serializes some tasks, which may also take some time. --- scheduler/compileserver.cpp | 25 +++++++++++++++++++------ scheduler/compileserver.h | 3 ++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/scheduler/compileserver.cpp b/scheduler/compileserver.cpp index 1353b76..e1216bc 100644 --- a/scheduler/compileserver.cpp +++ b/scheduler/compileserver.cpp @@ -194,10 +194,13 @@ bool CompileServer::is_eligible_now(const Job *job) const { if(!is_eligible_ever(job)) return false; - int jobs_now = currentJobCount(); - bool jobs_okay = jobs_now < m_maxJobs; - if( m_maxJobs > 0 && jobs_now < m_maxJobs + maxPreloadCount()) - jobs_okay = true; // allow a job for preloading + int local_jobs_now = currentJobCountLocal(); + int jobs_now = local_jobs_now + currentJobCountRemote(); + bool jobs_okay = jobs_now < m_maxJobs; + // allow a job for preloading, but only if the node isn't fully + // busy with local jobs (that may possibly take long) + if( m_maxJobs > 0 && jobs_now < m_maxJobs + maxPreloadCount() && local_jobs_now < m_maxJobs) + jobs_okay = true; bool load_okay = m_load < 1000; bool eligible = jobs_okay && load_okay @@ -285,14 +288,24 @@ void 
CompileServer::setMaxJobs(int jobs) m_maxJobs = jobs; } -int CompileServer::currentJobCount() const +int CompileServer::currentJobCountRemote() const +{ + return m_jobList.size(); +} + +int CompileServer::currentJobCountLocal() const { - int count = m_jobList.size(); + int count = 0; for( const std::pair<int, LocalJobInfo>& info : m_clientLocalMap ) count += info.second.fulljob ? m_maxJobs : 1; return count; } +int CompileServer::currentJobCount() const +{ + return currentJobCountRemote() + currentJobCountLocal(); +} + bool CompileServer::noRemote() const { return m_noRemote; diff --git a/scheduler/compileserver.h b/scheduler/compileserver.h index 331de4f..437445c 100644 --- a/scheduler/compileserver.h +++ b/scheduler/compileserver.h @@ -84,8 +84,9 @@ public: int maxJobs() const; void setMaxJobs(const int jobs); int maxPreloadCount() const; - // Counts both remote and local jobs. int currentJobCount() const; + int currentJobCountRemote() const; + int currentJobCountLocal() const; bool noRemote() const; void setNoRemote(const bool value); -- 2.35.3 From 116ed5379a21586f47a0f4d855343a5db5398af0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= <l.lunak@centrum.cz> Date: Mon, 9 May 2022 08:01:21 +0200 Subject: [PATCH] use larger timeout for "full" local jobs When doing several LTO linking jobs, each of them may take quite a long time, and it is undesirable for the others to time out and start running as well while one is already active. --- client/main.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/client/main.cpp b/client/main.cpp index fdf4ac3..e4939ba 100644 --- a/client/main.cpp +++ b/client/main.cpp @@ -593,8 +593,9 @@ int main(int argc, char **argv) /* Inform the daemon that we like to start a job. */ if (local_daemon->send_msg(JobLocalBeginMsg(0, get_absfilename(job.outputFile()),fulljob))) { /* Now wait until the daemon gives us the start signal. 40 minutes - should be enough for all normal compile or link jobs. 
*/ - startme = local_daemon->get_msg(40 * 60); + should be enough for all normal compile or link jobs, but with expensive jobs + (which fulljobs may likely be, e.g. LTO linking) use an even larger timeout. */ + startme = local_daemon->get_msg(fulljob ? 120 * 60 : 40 * 60); } /* If we can't talk to the daemon anymore we need to fall back -- 2.35.3
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor