From d5f159e668497006ed8c0f6cf48d2c3c9c81b4f9 Mon Sep 17 00:00:00 2001
From: kexkey <github@kexkey.com>
Date: Mon, 17 Jan 2022 16:53:33 -0500
Subject: [PATCH] Retrying up to 5 times if DB migration fails...

---
 proxy_docker/app/script/startproxy.sh | 34 ++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/proxy_docker/app/script/startproxy.sh b/proxy_docker/app/script/startproxy.sh
index 3ba6b5d..793ceb3 100644
--- a/proxy_docker/app/script/startproxy.sh
+++ b/proxy_docker/app/script/startproxy.sh
@@ -23,15 +23,33 @@ createCurlConfig() {
 
 }
 
+# If the file .dbfailed exists, it means we previously failed to process DB migrations.
+# Sometimes, depending on timing, a migration fails without anything actually being corrupted.
+# It may be a container that was not accessible for a short period of time, for example.
+# So we'll try up to MAX_ATTEMPTS times before giving up.
+
+# For this to work, we'll put the number of attempts in the .dbfailed file.
+
+MAX_ATTEMPTS=5
+
+nb_attempts=1
 if [ -e ${DB_PATH}/.dbfailed ]; then
+  n=$(cat ${DB_PATH}/.dbfailed)
+  nb_attempts=$((n+1))
+fi
+
+if [ "${nb_attempts}" -gt "${MAX_ATTEMPTS}" ]; then
   touch /container_monitor/proxy_dbfailed
-  trace "[startproxy] A previous database creation/migration failed.  Stopping."
-  trace "[startproxy] A file called .dbfailed has been created.  Fix the migration errors, remove .dbfailed and retry."
+  trace "[startproxy] Too many database creation/migration failed attempts.  Failed attempts = ${nb_attempts}."
+  trace "[startproxy] A file called .dbfailed has been created in your proxy datapath.  Fix the migration errors, remove .dbfailed and retry."
+  trace "[startproxy] Check your log files, especially postgres."
   trace "[startproxy] Exiting."
   sleep 30
   exit 1
 else
-  rm -f /container_monitor/proxy_dbfailed
+  if [ "${nb_attempts}" -gt "1" ]; then
+    trace "[startproxy] Current database creation/migration attempt = ${nb_attempts}.  Retrying..."
+  fi
 fi
 
 trace "[startproxy] Waiting for PostgreSQL to be ready..."
@@ -57,17 +75,21 @@ else
 fi
 
 if [ "${returncode}" -ne "0" ]; then
-  touch ${DB_PATH}/.dbfailed
-  touch /container_monitor/proxy_dbfailed
-  trace "[startproxy] Database creation/migration failed.  Stopping."
+  echo -n "${nb_attempts}" > ${DB_PATH}/.dbfailed
+  trace "[startproxy] Database creation/migration failed.  We will retry ${MAX_ATTEMPTS} times."
   trace "[startproxy] A file called .dbfailed has been created in your proxy datapath.  Fix the migration errors, remove .dbfailed and retry."
+  trace "[startproxy] Check your log files, especially postgres."
   trace "[startproxy] Exiting."
   sleep 30
   exit ${returncode}
 fi
 
+# /container_monitor/proxy_ready will be created by Docker's health check
 rm -f /container_monitor/proxy_ready
 
+rm -f /container_monitor/proxy_dbfailed
+rm -f ${DB_PATH}/.dbfailed
+
 chmod 0600 $DB_FILE
 
 createCurlConfig ${WATCHER_BTC_NODE_RPC_CFG} ${WATCHER_BTC_NODE_RPC_USER}