File l-flto-all-slots.patch of Package icecream

Overview Repositories Revisions Requests Users Attributes Meta

File l-flto-all-slots.patch of Package icecream

From e7941ae0b016862100d973c66c053f1bc61c27df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= <l.lunak@centrum.cz>
Date: Sat, 7 May 2022 17:57:03 +0200
Subject: [PATCH 1/3] make parallel -flto linking jobs reserve all local slots

If linking uses parallel LTO, then it doesn't make sense to run
more than one such link job at the same time (they'd use more memory,
sometimes causing OOM, and they'd compete for the CPU). So
make such "full" local jobs reserve all local slots while running.
---
 client/arg.cpp          | 22 ++++++++++++++++++++--
 client/client.h         | 11 +++++++++--
 client/main.cpp         |  9 +++++++--
 daemon/main.cpp         | 37 ++++++++++++++++++++++++++++---------
 scheduler/scheduler.cpp |  3 ++-
 services/comm.cpp       |  8 ++++++++
 services/comm.h         |  9 ++++++---
 tests/Makefile.flto     | 13 +++++++++++++
 tests/flto-g++          | 15 +++++++++++++++
 tests/test.sh           | 39 ++++++++++++++++++++++++++++++++++++++-
 10 files changed, 146 insertions(+), 20 deletions(-)
 create mode 100644 tests/Makefile.flto
 create mode 100755 tests/flto-g++

diff --git a/client/arg.cpp b/client/arg.cpp
index 3a26dc4..260209d 100644
--- a/client/arg.cpp
+++ b/client/arg.cpp
@@ -286,7 +286,7 @@ static bool analyze_assembler_arg(string &arg, list<string> *extrafiles)
 }
 
 
-bool analyse_argv(const char * const *argv, CompileJob &job, bool icerun, list<string> *extrafiles)
+int analyse_argv(const char * const *argv, CompileJob &job, bool icerun, list<string> *extrafiles)
 {
     ArgumentsList args;
     string ofile;
@@ -315,6 +315,7 @@ bool analyse_argv(const char * const *argv, CompileJob &job, bool icerun, list<s
     bool seen_march_native = false;
     bool seen_mcpu_native = false;
     bool seen_mtune_native = false;
+    std::string seen_parallel_flto;
     const char *standard = nullptr;
     // if rewriting includes and precompiling on remote machine, then cpp args are not local
     Argument_Type Arg_Cpp = compiler_only_rewrite_includes(job) ? Arg_Rest : Arg_Local;
@@ -709,6 +710,15 @@ bool analyse_argv(const char * const *argv, CompileJob &job, bool icerun, list<s
                 if (argv[i + 1]) {
                     args.append(argv[++i], Arg_Rest);
                 }
+            } else if (str_startswith("-flto", a)) {
+                args.append(a, Arg_Rest);
+                // If -flto will be parallel (and thus use all cores), make it a "full" job
+                // so that it reserves the entire local node. The only non-parallel -flto
+                // options appear to be GCC's -flto without arguments or -flto=1.
+                if( compiler_is_clang(job))
+                    seen_parallel_flto = a;
+                else if( !str_equal("-flto", a) && !str_equal("-flto=1", a))
+                    seen_parallel_flto = a;
             } else {
                 args.append(a, Arg_Rest);
 
@@ -946,5 +956,13 @@ bool analyse_argv(const char * const *argv, CompileJob &job, bool icerun, list<s
             << endl;
 #endif
 
-    return always_local;
+    int ret = 0;
+    if( always_local ) {
+        ret |= AlwaysLocal;
+        if( !seen_parallel_flto.empty() && !seen_c ) {
+            ret |= FullJob;
+            trace() << seen_parallel_flto << " and no -c, building with all local slots" << endl;
+        }
+    }
+    return ret;
 }
diff --git a/client/client.h b/client/client.h
index cbaf73d..a3b05a8 100644
--- a/client/client.h
+++ b/client/client.h
@@ -43,9 +43,16 @@ extern std::string remote_daemon;
 /* in remote.cpp */
 extern std::string get_absfilename(const std::string &_file);
 
+enum RunFlags
+{
+    None        = 0,
+    AlwaysLocal = 1 << 0,  // The job should be built locally.
+    FullJob      = 1 << 1   // The job should reserve all slots (if AlwaysLocal).
+};
 /* In arg.cpp.  */
-extern bool analyse_argv(const char * const *argv, CompileJob &job, bool icerun,
-                         std::list<std::string> *extrafiles);
+// Returns RunFlags or-ed.
+extern int analyse_argv(const char * const *argv, CompileJob &job, bool icerun,
+                        std::list<std::string> *extrafiles);
 
 /* In cpp.cpp.  */
 extern pid_t call_cpp(CompileJob &job, int fdwrite, int fdread = -1);
diff --git a/client/main.cpp b/client/main.cpp
index a08828d..fdf4ac3 100644
--- a/client/main.cpp
+++ b/client/main.cpp
@@ -403,7 +403,12 @@ int main(int argc, char **argv)
     }
 
     list<string> extrafiles;
-    local |= analyse_argv(argv, job, icerun, &extrafiles);
+    bool fulljob = false;
+    int argv_result = analyse_argv(argv, job, icerun, &extrafiles);
+    if( argv_result & AlwaysLocal ) {
+        local = true;
+        fulljob = argv_result & FullJob;
+    }
 
     /* If ICECC is set to disable, then run job locally, without contacting
        the daemon at all. File-based locking will still ensure that all
@@ -586,7 +591,7 @@ int main(int argc, char **argv)
         Msg *startme = nullptr;
 
         /* Inform the daemon that we like to start a job.  */
-        if (local_daemon->send_msg(JobLocalBeginMsg(0, get_absfilename(job.outputFile())))) {
+        if (local_daemon->send_msg(JobLocalBeginMsg(0, get_absfilename(job.outputFile()),fulljob))) {
             /* Now wait until the daemon gives us the start signal.  40 minutes
                should be enough for all normal compile or link jobs.  */
             startme = local_daemon->get_msg(40 * 60);
diff --git a/daemon/main.cpp b/daemon/main.cpp
index a893b31..75d540f 100644
--- a/daemon/main.cpp
+++ b/daemon/main.cpp
@@ -141,6 +141,7 @@ public:
         pipe_from_child = -1;
         pipe_to_child = -1;
         child_pid = -1;
+        fulljob = false;
     }
 
     static string status_str(Status status) {
@@ -209,6 +210,7 @@ public:
     // pipe to child process, only valid if TOINSTALL/WAITINSTALL
     int pipe_to_child;
     pid_t child_pid;
+    bool fulljob; // during LINKJOB and CLIENTWORK, reserve all slots if set
     string pending_create_env; // only for WAITCREATEENV
 
     string dump() const {
@@ -216,7 +218,8 @@ public:
 
         switch (status) {
         case LINKJOB:
-            return ret + " ClientID: " + toString(client_id) + " " + outfile + " PID: " + toString(child_pid);
+            return ret + " ClientID: " + toString(client_id) + " " + outfile + (fulljob ? " (full)" : "")
+                + " PID: " + toString(child_pid);
         case TOINSTALL:
         case WAITINSTALL:
             return ret + " ClientID: " + toString(client_id) + " " + outfile + " PID: " + toString(child_pid);
@@ -1510,12 +1513,17 @@ bool Daemon::create_env_finished(string env_key)
 bool Daemon::handle_job_done(Client *cl, JobDoneMsg *m)
 {
     if (cl->status == Client::CLIENTWORK) {
-        clients.active_processes--;
+        if(cl->fulljob)
+            clients.active_processes -= std::max((unsigned int)1, max_kids);
+        else
+            clients.active_processes--;
     }
 
     cl->status = Client::JOBDONE;
     JobDoneMsg *msg = static_cast<JobDoneMsg *>(m);
-    trace() << "handle_job_done " << msg->job_id << " " << msg->exitcode << endl;
+    trace() << "handle_job_done " << msg->job_id << " " << (cl->fulljob ? "(full) " : "")
+        << msg->exitcode << endl;
+    cl->fulljob = false;
 
     if (!m->is_from_server()
             && (m->user_msec + m->sys_msec) <= m->real_msec) {
@@ -1544,10 +1552,15 @@ void Daemon::handle_old_request()
                 handle_end(client, 112);
             } else {
                 client->status = Client::CLIENTWORK;
-                clients.active_processes++;
-                trace() << "pushed local job " << client->client_id << endl;
-
-                if (!send_scheduler(JobLocalBeginMsg(client->client_id, client->outfile))) {
+                if(client->fulljob) { // reserve the entire node
+                    clients.active_processes += std::max((unsigned int)1, max_kids);
+                    trace() << "pushed full local job " << client->client_id << endl;
+                } else {
+                    clients.active_processes++;
+                    trace() << "pushed local job " << client->client_id << endl;
+                }
+                if (!send_scheduler(JobLocalBeginMsg(client->client_id, client->outfile,
+                        client->fulljob))) {
                     return;
                 }
             }
@@ -1723,8 +1736,12 @@ void Daemon::handle_end(Client *client, int exitcode)
     }
 
     if (client->status == Client::CLIENTWORK) {
-        clients.active_processes--;
+        if(client->fulljob)
+            clients.active_processes -= std::max((unsigned int)1, max_kids);
+        else
+            clients.active_processes--;
     }
+    client->fulljob = false;
 
     if (client->status == Client::WAITCOMPILE && exitcode == 119) {
         /* the client sent us a real good bye, so forget about the scheduler */
@@ -1861,8 +1878,10 @@ int Daemon::handle_cs_conf(ConfCSMsg *msg)
 
 bool Daemon::handle_local_job(Client *client, Msg *msg)
 {
+    JobLocalBeginMsg* m = dynamic_cast<JobLocalBeginMsg *>(msg);
     client->status = Client::LINKJOB;
-    client->outfile = dynamic_cast<JobLocalBeginMsg *>(msg)->outfile;
+    client->outfile = m->outfile;
+    client->fulljob = m->fulljob;
     return true;
 }
 
diff --git a/scheduler/scheduler.cpp b/scheduler/scheduler.cpp
index b1ea334..a5f1a24 100644
--- a/scheduler/scheduler.cpp
+++ b/scheduler/scheduler.cpp
@@ -567,7 +567,8 @@ static bool handle_local_job(CompileServer *cs, Msg *_m)
     }
 
     ++new_job_id;
-    trace() << "handle_local_job " << m->outfile << " " << m->id << endl;
+    trace() << "handle_local_job " << (m->fulljob ? "(full) " : "") << m->outfile
+        << " " << m->id << endl;
     cs->insertClientJobId(m->id, new_job_id);
     notify_monitors(new MonLocalJobBeginMsg(new_job_id, m->outfile, m->stime, cs->hostId()));
     return true;
diff --git a/services/comm.cpp b/services/comm.cpp
index ad53393..5332f7d 100644
--- a/services/comm.cpp
+++ b/services/comm.cpp
@@ -2281,6 +2281,11 @@ void JobLocalBeginMsg::fill_from_channel(MsgChannel *c)
     *c >> stime;
     *c >> outfile;
     *c >> id;
+    if (IS_PROTOCOL_44(c)) {
+        uint32_t full;
+        *c >> full;
+        fulljob = full;
+    }
 }
 
 void JobLocalBeginMsg::send_to_channel(MsgChannel *c) const
@@ -2289,6 +2294,9 @@ void JobLocalBeginMsg::send_to_channel(MsgChannel *c) const
     *c << stime;
     *c << outfile;
     *c << id;
+    if (IS_PROTOCOL_44(c)) {
+        *c << (uint32_t) fulljob;
+    }
 }
 
 void JobLocalDoneMsg::fill_from_channel(MsgChannel *c)
diff --git a/services/comm.h b/services/comm.h
index 48258b0..3d5fa08 100644
--- a/services/comm.h
+++ b/services/comm.h
@@ -36,7 +36,7 @@
 #include "job.h"
 
 // if you increase the PROTOCOL_VERSION, add a macro below and use that
-#define PROTOCOL_VERSION 43
+#define PROTOCOL_VERSION 44
 // if you increase the MIN_PROTOCOL_VERSION, comment out macros below and clean up the code
 #define MIN_PROTOCOL_VERSION 21
 
@@ -68,6 +68,7 @@
 #define IS_PROTOCOL_41(c) ((c)->protocol >= 41)
 #define IS_PROTOCOL_42(c) ((c)->protocol >= 42)
 #define IS_PROTOCOL_43(c) ((c)->protocol >= 43)
+#define IS_PROTOCOL_44(c) ((c)->protocol >= 44)
 
 // Terms used:
 // S  = scheduler
@@ -685,11 +686,12 @@ public:
 class JobLocalBeginMsg : public Msg
 {
 public:
-    JobLocalBeginMsg(int job_id = 0, const std::string &file = "")
+    JobLocalBeginMsg(int job_id = 0, const std::string &file = "", bool full = false)
         : Msg(M_JOB_LOCAL_BEGIN)
         , outfile(file)
         , stime(time(0))
-        , id(job_id) {}
+        , id(job_id)
+        , fulljob(full) {}
 
     virtual void fill_from_channel(MsgChannel *c);
     virtual void send_to_channel(MsgChannel *c) const;
@@ -697,6 +699,7 @@ public:
     std::string outfile;
     uint32_t stime;
     uint32_t id;
+    bool fulljob;
 };
 
 class JobLocalDoneMsg : public Msg
diff --git a/tests/Makefile.flto b/tests/Makefile.flto
new file mode 100644
index 0000000..634df2c
--- /dev/null
+++ b/tests/Makefile.flto
@@ -0,0 +1,13 @@
+all: flto1 flto2
+
+# $(CXX) will be flto-g++ dummy, the only thing that matters is that there's no -c
+# and that there is -flto=auto
+flto1: prepare_lock
+	$(CXX) a1.cpp -flto=auto
+
+flto2: prepare_lock
+	$(CXX) a2.cpp -flto=auto
+
+# make sure there's no lockfile at the start
+prepare_lock:
+	rm -f flto_lockfile
diff --git a/tests/flto-g++ b/tests/flto-g++
new file mode 100755
index 0000000..bab881e
--- /dev/null
+++ b/tests/flto-g++
@@ -0,0 +1,15 @@
+#! /bin/sh
+
+# the lockfile has been removed by the Makefile, so if it exists,
+# it means another job is running in parallel -> fail
+if test -f ./flto_lockfile; then
+    echo flto-g++ found a lock file >&2
+    exit 1
+fi
+
+# create a lockfile while running to detect two jobs running in parallel
+touch ./flto_lockfile
+sleep 2
+rm -f ./flto_lockfile
+
+exit 0
diff --git a/tests/test.sh b/tests/test.sh
index 62203f3..323e62b 100755
--- a/tests/test.sh
+++ b/tests/test.sh
@@ -837,6 +837,31 @@ make_test()
     make -f Makefile.test OUTDIR="$testdir" clean -s
 }
 
+serialized_flto_test()
+{
+    # check that two link jobs with -flto=auto are not run at the same time
+    echo Running serialize flto test.
+    reset_logs "" "serialize flto test"
+    # use a dummy "compiler" that waits for a while and then exits
+    ICECC_TEST_SOCKET="$testdir"/socket-localice ICECC_TEST_REMOTEBUILD=1 ICECC_DEBUG=debug ICECC_LOGFILE="$testdir"/icecc.log \
+        make -f Makefile.flto OUTDIR="$testdir" CXX="${icecc} ./flto-g++" -j2 -s 2>>"$testdir"/stderr.log
+    if test $? -ne 0; then
+        echo Serialize flto test failed.
+        stop_ice 0
+        abort_tests
+    fi
+    flush_logs
+    check_logs_for_generic_errors
+    check_everything_is_idle
+
+    check_log_message_count icecc 2 "-flto=auto and no -c, building with all local slots"
+    check_log_message_count localice 2 "pushed full local job"
+    check_log_message_count scheduler 2 "handle_local_job (full)"
+    check_log_message_count scheduler 2 "handle_local_job_done"
+    echo Serialize flto test successful.
+    echo
+}
+
 # 1st argument, if set, means we run without scheduler
 icerun_serialize_test()
 {
@@ -1881,7 +1906,7 @@ check_log_message_count()
 {
     log="$1"
     expected_count="$2"
-    count=$(cat_log_last_mark ${log} | grep "$3" | wc -l)
+    count=$(cat_log_last_mark ${log} | grep -- "$3" | wc -l)
     if test $count -ne $expected_count; then
         echo "Error, $log log does not contain expected count (${count} vs ${expected_count}): $3"
         stop_ice 0
@@ -1981,6 +2006,12 @@ echo
 
 run_ice "$testdir/plain.o" "remote" 0 $TESTCXX -Wall -Werror -c plain.cpp -o "$testdir/"plain.o
 
+if test -z "$chroot_disabled"; then
+    serialized_flto_test
+else
+    skipped_tests="$skipped_tests serialized_flto_test"
+fi
+
 run_ice "$testdir/plain.o" "remote" 0 $TESTCC -Wall -Werror -c plain.c -o "$testdir/"plain.o
 run_ice "$testdir/plain.o" "remote" 0 $TESTCXX -Wall -Werror -c plain.cpp -O2 -o "$testdir/"plain.o
 run_ice "$testdir/plain.ii" "local" 0 $TESTCXX -Wall -Werror -E plain.cpp -o "$testdir/"plain.ii
@@ -2246,6 +2277,12 @@ else
     skipped_tests="$skipped_tests make_test"
 fi
 
+if test -z "$chroot_disabled"; then
+    serialized_flto_test
+else
+    skipped_tests="$skipped_tests serialized_flto_test"
+fi
+
 if test -z "$chroot_disabled"; then
     zero_local_jobs_test
 else
-- 
2.35.3

From 891b3f3b291f9455465ab7b2ba2016b62c064fda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= <l.lunak@centrum.cz>
Date: Sat, 7 May 2022 19:47:32 +0200
Subject: [PATCH 2/3] count also local jobs when selecting where to schedule a
 job

Up until now, if a node was fully busy e.g. linking, the scheduler
still considered it to have no jobs running and might have decided
to schedule jobs to it.
---
 scheduler/compileserver.cpp | 30 ++++++++++++++++++++----------
 scheduler/compileserver.h   | 18 ++++++++++++++----
 scheduler/scheduler.cpp     | 24 ++++++++++++------------
 3 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/scheduler/compileserver.cpp b/scheduler/compileserver.cpp
index 314544a..1353b76 100644
--- a/scheduler/compileserver.cpp
+++ b/scheduler/compileserver.cpp
@@ -61,7 +61,7 @@ CompileServer::CompileServer(const int fd, struct sockaddr *_addr, const socklen
     , m_lastRequestedJobs()
     , m_cumCompiled()
     , m_cumRequested()
-    , m_clientMap()
+    , m_clientLocalMap()
     , m_blacklist()
     , m_inFd(-1)
     , m_inConnAttempt(0)
@@ -194,15 +194,17 @@ bool CompileServer::is_eligible_now(const Job *job) const
 {
     if(!is_eligible_ever(job))
         return false;
-    bool jobs_okay = int(m_jobList.size()) < m_maxJobs;
-    if( m_maxJobs > 0 && int(m_jobList.size()) < m_maxJobs + maxPreloadCount())
+    int jobs_now = currentJobCount();
+    bool jobs_okay =  jobs_now < m_maxJobs;
+    if( m_maxJobs > 0 && jobs_now < m_maxJobs + maxPreloadCount())
         jobs_okay = true; // allow a job for preloading
     bool load_okay = m_load < 1000;
     bool eligible = jobs_okay
                     && load_okay
                     && can_install(job, false).size();
 #if DEBUG_SCHEDULER > 2
-    trace() << nodeName() << " is_eligible_now: " << eligible << " (jobs_okay " << jobs_okay
+    trace() << nodeName() << " is_eligible_now: " << eligible << " (remote jobs " << m_jobList.size()
+        << ", local jobs " << (currentJobCount() - m_jobList.size()) << ", jobs_okay " << jobs_okay
         << ", load_okay " << load_okay << ")" << endl;
 #endif
     return eligible;
@@ -283,6 +285,14 @@ void CompileServer::setMaxJobs(int jobs)
     m_maxJobs = jobs;
 }
 
+int CompileServer::currentJobCount() const
+{
+    int count = m_jobList.size();
+    for( const std::pair<int, LocalJobInfo>& info : m_clientLocalMap )
+        count += info.second.fulljob ? m_maxJobs : 1;
+    return count;
+}
+
 bool CompileServer::noRemote() const
 {
     return m_noRemote;
@@ -444,19 +454,19 @@ void CompileServer::setCumRequested(const JobStat &stats)
     m_cumRequested = stats;
 }
 
-int CompileServer::getClientJobId(const int localJobId)
+int CompileServer::getClientLocalJobId(const int localJobId)
 {
-    return m_clientMap[localJobId];
+    return m_clientLocalMap[localJobId].id;
 }
 
-void CompileServer::insertClientJobId(const int localJobId, const int newJobId)
+void CompileServer::insertClientLocalJobId(const int localJobId, const int newJobId, bool fulljob)
 {
-    m_clientMap[localJobId] = newJobId;
+    m_clientLocalMap[localJobId] = LocalJobInfo{newJobId, fulljob};
 }
 
-void CompileServer::eraseClientJobId(const int localJobId)
+void CompileServer::eraseClientLocalJobId(const int localJobId)
 {
-    m_clientMap.erase(localJobId);
+    m_clientLocalMap.erase(localJobId);
 }
 
 map<const CompileServer *, Environments> CompileServer::blacklist() const
diff --git a/scheduler/compileserver.h b/scheduler/compileserver.h
index 69205d5..331de4f 100644
--- a/scheduler/compileserver.h
+++ b/scheduler/compileserver.h
@@ -84,6 +84,8 @@ public:
     int maxJobs() const;
     void setMaxJobs(const int jobs);
     int maxPreloadCount() const;
+    // Counts both remote and local jobs.
+    int currentJobCount() const;
 
     bool noRemote() const;
     void setNoRemote(const bool value);
@@ -132,9 +134,9 @@ public:
 
     unsigned int hostidCounter() const;
 
-    int getClientJobId(const int localJobId);
-    void insertClientJobId(const int localJobId, const int newJobId);
-    void eraseClientJobId(const int localJobId);
+    int getClientLocalJobId(const int localJobId);
+    void insertClientLocalJobId(const int localJobId, const int newJobId, bool fulljob);
+    void eraseClientLocalJobId(const int localJobId);
 
     map<const CompileServer *, Environments> blacklist() const;
     Environments getEnvsForBlacklistedCS(const CompileServer *cs);
@@ -180,7 +182,15 @@ private:
     JobStat m_cumRequested;
 
     static unsigned int s_hostIdCounter;
-    map<int, int> m_clientMap; // map client ID for daemon to our IDs
+
+    // map client ID for daemon to our IDs
+    struct LocalJobInfo
+    {
+        int id;
+        bool fulljob;
+    };
+    map<int, LocalJobInfo> m_clientLocalMap;
+
     map<const CompileServer *, Environments> m_blacklist;
 
     int m_inFd;
diff --git a/scheduler/scheduler.cpp b/scheduler/scheduler.cpp
index a5f1a24..4521223 100644
--- a/scheduler/scheduler.cpp
+++ b/scheduler/scheduler.cpp
@@ -315,7 +315,7 @@ static float server_speed(CompileServer *cs, Job *job, bool blockDebug)
              * takes care of the fact that not all slots are equally fast on
              * CPUs with SMT and dynamic clock ramping.
              */
-            f *= (1.0f - (0.5f * cs->jobList().size() / cs->maxJobs()));
+            f *= (1.0f - (0.5f * cs->currentJobCount() / cs->maxJobs()));
         }
 
         // below we add a pessimism factor - assuming the first job a computer got is not representative
@@ -569,7 +569,7 @@ static bool handle_local_job(CompileServer *cs, Msg *_m)
     ++new_job_id;
     trace() << "handle_local_job " << (m->fulljob ? "(full) " : "") << m->outfile
         << " " << m->id << endl;
-    cs->insertClientJobId(m->id, new_job_id);
+    cs->insertClientLocalJobId(m->id, new_job_id, m->fulljob);
     notify_monitors(new MonLocalJobBeginMsg(new_job_id, m->outfile, m->stime, cs->hostId()));
     return true;
 }
@@ -583,8 +583,8 @@ static bool handle_local_job_done(CompileServer *cs, Msg *_m)
     }
 
     trace() << "handle_local_job_done " << m->job_id << endl;
-    notify_monitors(new JobLocalDoneMsg(cs->getClientJobId(m->job_id)));
-    cs->eraseClientJobId(m->job_id);
+    notify_monitors(new JobLocalDoneMsg(cs->getClientLocalJobId(m->job_id)));
+    cs->eraseClientLocalJobId(m->job_id);
     return true;
 }
 
@@ -706,8 +706,8 @@ static CompileServer *pick_server(Job *job)
         // Ignore ineligible servers
         if (!cs->is_eligible_now(job)) {
 #if DEBUG_SCHEDULER > 1
-            if ((int(cs->jobList().size()) >= cs->maxJobs() + cs->maxPreloadCount()) || (cs->load() >= 1000)) {
-                trace() << "overloaded " << cs->nodeName() << " " << cs->jobList().size() << "/"
+            if ((cs->currentJobCount() >= cs->maxJobs() + cs->maxPreloadCount()) || (cs->load() >= 1000)) {
+                trace() << "overloaded " << cs->nodeName() << " " << cs->currentJobCount() << "/"
                         <<  cs->maxJobs() << " jobs, load:" << cs->load() << endl;
             } else
                 trace() << cs->nodeName() << " not eligible" << endl;
@@ -738,12 +738,12 @@ static CompileServer *pick_server(Job *job)
 
 #if DEBUG_SCHEDULER > 1
         trace() << cs->nodeName() << " compiled " << cs->lastCompiledJobs().size() << " got now: " <<
-                cs->jobList().size() << " speed: " << server_speed(cs, job, true) << " compile time " <<
+                cs->currentJobCount() << " speed: " << server_speed(cs, job, true) << " compile time " <<
                 cs->cumCompiled().compileTimeUser() << " produced code " << cs->cumCompiled().outputSize() <<
                 " client count: " << cs->clientCount() << endl;
 #endif
 
-        if ((cs->lastCompiledJobs().size() == 0) && (cs->jobList().size() == 0) && cs->maxJobs()) {
+        if ((cs->lastCompiledJobs().size() == 0) && (cs->currentJobCount() == 0) && cs->maxJobs()) {
             /* Make all servers compile a job at least once, so we'll get an
                idea about their speed.  */
             if (!envs_match(cs, job).empty()) {
@@ -775,7 +775,7 @@ static CompileServer *pick_server(Job *job)
                the job.  (XXX currently this is equivalent to the fastest one)  */
             else if ((best->lastCompiledJobs().size() != 0)
                      && (server_speed(best, job) < server_speed(cs, job))) {
-                if (int(cs->jobList().size()) < cs->maxJobs()) {
+                if (cs->currentJobCount() < cs->maxJobs()) {
                     best = cs;
                 } else {
                     bestpre = cs;
@@ -791,7 +791,7 @@ static CompileServer *pick_server(Job *job)
                the job.  (XXX currently this is equivalent to the fastest one)  */
             else if ((bestui->lastCompiledJobs().size() != 0)
                      && (server_speed(bestui, job) < server_speed(cs, job))) {
-                if (int(cs->jobList().size()) < cs->maxJobs()) {
+                if (cs->currentJobCount() < cs->maxJobs()) {
                     bestui = cs;
                 } else {
                     bestpre = cs;
@@ -931,7 +931,7 @@ static bool empty_queue()
         /* Ignore the load on the submitter itself if no other host could
            be found.  We only obey to its max job number.  */
         use_cs = job->submitter();
-        if ((int(use_cs->jobList().size()) < use_cs->maxJobs())
+        if ((use_cs->currentJobCount() < use_cs->maxJobs())
                 && job->preferredHost().empty()
                 /* This should be trivially true.  */
                 && use_cs->can_install(job).size()) {
@@ -1487,7 +1487,7 @@ static bool handle_line(CompileServer *cs, Msg *_m)
             line = " " + it->nodeName() + buffer;
             line += "[" + it->hostPlatform() + "] speed=";
             sprintf(buffer, "%.2f jobs=%d/%d load=%u", server_speed(it),
-                    (int)it->jobList().size(), it->maxJobs(), it->load());
+                    it->currentJobCount(), it->maxJobs(), it->load());
             line += buffer;
 
             if (it->busyInstalling()) {
-- 
2.35.3

From 57fd7f1441f8c484af3e229fb2e4ca2646487427 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= <l.lunak@centrum.cz>
Date: Sat, 7 May 2022 19:56:11 +0200
Subject: [PATCH 3/3] don't allow job preloading if fully busy with local jobs

If a node is fully busy with local jobs, that may mean it's linking,
which may take some time (especially if using LTO), or it may
also be icerun serializing some tasks, which may take some time
as well. In either case, do not preload another job onto that node.
---
 scheduler/compileserver.cpp | 25 +++++++++++++++++++------
 scheduler/compileserver.h   |  3 ++-
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/scheduler/compileserver.cpp b/scheduler/compileserver.cpp
index 1353b76..e1216bc 100644
--- a/scheduler/compileserver.cpp
+++ b/scheduler/compileserver.cpp
@@ -194,10 +194,13 @@ bool CompileServer::is_eligible_now(const Job *job) const
 {
     if(!is_eligible_ever(job))
         return false;
-    int jobs_now = currentJobCount();
-    bool jobs_okay =  jobs_now < m_maxJobs;
-    if( m_maxJobs > 0 && jobs_now < m_maxJobs + maxPreloadCount())
-        jobs_okay = true; // allow a job for preloading
+    int local_jobs_now = currentJobCountLocal();
+    int jobs_now = local_jobs_now + currentJobCountRemote();
+    bool jobs_okay = jobs_now < m_maxJobs;
+    // allow a job for preloading, but only if the node isn't fully
+    // busy with local jobs (that may possibly take long)
+    if( m_maxJobs > 0 && jobs_now < m_maxJobs + maxPreloadCount() && local_jobs_now < m_maxJobs)
+        jobs_okay = true;
     bool load_okay = m_load < 1000;
     bool eligible = jobs_okay
                     && load_okay
@@ -285,14 +288,24 @@ void CompileServer::setMaxJobs(int jobs)
     m_maxJobs = jobs;
 }
 
-int CompileServer::currentJobCount() const
+int CompileServer::currentJobCountRemote() const
+{
+    return m_jobList.size();
+}
+
+int CompileServer::currentJobCountLocal() const
 {
-    int count = m_jobList.size();
+    int count = 0;
     for( const std::pair<int, LocalJobInfo>& info : m_clientLocalMap )
         count += info.second.fulljob ? m_maxJobs : 1;
     return count;
 }
 
+int CompileServer::currentJobCount() const
+{
+    return currentJobCountRemote() + currentJobCountLocal();
+}
+
 bool CompileServer::noRemote() const
 {
     return m_noRemote;
diff --git a/scheduler/compileserver.h b/scheduler/compileserver.h
index 331de4f..437445c 100644
--- a/scheduler/compileserver.h
+++ b/scheduler/compileserver.h
@@ -84,8 +84,9 @@ public:
     int maxJobs() const;
     void setMaxJobs(const int jobs);
     int maxPreloadCount() const;
-    // Counts both remote and local jobs.
     int currentJobCount() const;
+    int currentJobCountRemote() const;
+    int currentJobCountLocal() const;
 
     bool noRemote() const;
     void setNoRemote(const bool value);
-- 
2.35.3

From 116ed5379a21586f47a0f4d855343a5db5398af0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= <l.lunak@centrum.cz>
Date: Mon, 9 May 2022 08:01:21 +0200
Subject: [PATCH] use larger timeout for "full" local jobs

When doing several LTO linking jobs, each of them may take quite a long
time, and the jobs waiting for their turn should not time out and start
running while one is still active.
---
 client/main.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/client/main.cpp b/client/main.cpp
index fdf4ac3..e4939ba 100644
--- a/client/main.cpp
+++ b/client/main.cpp
@@ -593,8 +593,9 @@ int main(int argc, char **argv)
         /* Inform the daemon that we like to start a job.  */
         if (local_daemon->send_msg(JobLocalBeginMsg(0, get_absfilename(job.outputFile()),fulljob))) {
             /* Now wait until the daemon gives us the start signal.  40 minutes
-               should be enough for all normal compile or link jobs.  */
-            startme = local_daemon->get_msg(40 * 60);
+               should be enough for all normal compile or link jobs, but with expensive jobs
+               (which fulljobs may likely be, e.g. LTO linking) use an even larger timeout.  */
+            startme = local_daemon->get_msg(fulljob ? 120 * 60 : 40 * 60);
         }
 
         /* If we can't talk to the daemon anymore we need to fall back
-- 
2.35.3

Places

File l-flto-all-slots.patch of Package icecream

Places