spender-sandbox · SeanKim777 · Nov 2, 2016 · Nov 3, 2016
diff --git a/conf/reporting.conf b/conf/reporting.conf
@@ -107,6 +107,11 @@ resublimit = 5
 [malheur]
 enabled = no
 maxsimilar = 20
+# Run malheur in increment mode (instead of cluster mode) once the number of tasks
+# in the database reaches increment_mode_threshold.
+# See https://github.com/spender-sandbox/cuckoo-modified/issues/335
+# Default: 10000
+# Set to 0 to disable increment mode (always run in cluster mode)
+increment_mode_threshold = 10000
 
 [compression]
 enabled = yes

diff --git a/modules/reporting/malheur.py b/modules/reporting/malheur.py
@@ -7,10 +7,17 @@
 import hashlib
 import urllib
 import random
+import logging
+import multiprocessing
 
 from lib.cuckoo.common.constants import CUCKOO_ROOT
 from lib.cuckoo.common.abstracts import Report
 from lib.cuckoo.common.exceptions import CuckooReportError
+from lib.cuckoo.core.database import Database
+from lib.cuckoo.common.config import Config
+
+db = Database()
+log = logging.getLogger(__name__)
 
 def sanitize_file(filename):
     normals = filename.lower().replace('\\', ' ').replace('.', ' ').split(' ')
@@ -144,41 +151,86 @@ def run(self, results):
         """Runs Malheur processing
         @return: Nothing.  Results of this processing are obtained at an arbitrary future time.
         """
+        self.lock = multiprocessing.Lock()
         if results["target"]["category"] in ["pcap"]:
             return
 
         basedir = os.path.join(CUCKOO_ROOT, "storage", "malheur")
         cfgpath = os.path.join(CUCKOO_ROOT, "conf", "malheur.conf")
-        reportsdir = os.path.join(basedir, "reports")
-        task_id = str(results["info"]["id"])
-        outputfile = os.path.join(basedir, "malheur.txt." + hashlib.md5(str(random.random())).hexdigest())
+        mh_conf = Config("reporting").malheur
+        self.lock.acquire()        
         try:
-            os.makedirs(reportsdir)
-        except:
-            pass
+            if mh_conf.increment_mode_threshold == 0 or mh_conf.increment_mode_threshold:
+                increment_mode_threshold = mh_conf.increment_mode_threshold
+            else:
+                increment_mode_threshold = 10000
 
-        mist = mist_convert(results)
-        if mist:
-            with open(os.path.join(reportsdir, task_id + ".txt"), "w") as outfile:
-                outfile.write(mist)
+            try:
+                task_id = str(results["info"]["id"])
+            except KeyError as e:
+                raise CuckooReportError("No key in result. Error %s" % e)
 
-        # might need to prevent concurrent modifications to internal state of malheur by only allowing
-        # one analysis to be running malheur at a time
+            num_reports = db.count_tasks()
 
-        path, dirs, files = os.walk(reportsdir).next()
-        try:
-            cmdline = ["malheur", "-c", cfgpath, "-o", outputfile, "cluster", reportsdir]
-            run = subprocess.Popen(cmdline, stdout=subprocess.PIPE,
-                                   stdin=subprocess.PIPE,
-                                   stderr=subprocess.PIPE)
-            out, err = run.communicate()
-            for line in err.splitlines():
-                if line.startswith("Warning: Discarding empty feature vector"):
-                    badfile = line.split("'")[1].split("'")[0]
-                    os.remove(os.path.join(reportsdir, badfile))
-
-            # replace previous classification state with new results atomically
-            os.rename(outputfile, outputfile[:-33])
-
-        except Exception as e:
-            raise CuckooReportError("Failed to perform Malheur classification: %s" % e)
+            if (increment_mode_threshold == 0) or (num_reports < increment_mode_threshold):
+                malheur_running_mode = "cluster"
+                reportsdir = os.path.join(basedir, "reports")
+            else:
+                malheur_running_mode = "increment"
+                # Write the MIST report into a separate per-task directory to reduce malheur execution time in increment mode
+                reportsdir = os.path.join(basedir, "reports", task_id)
+
+            new_output_file = os.path.join(basedir, "malheur.txt." + hashlib.md5(str(random.random())).hexdigest())
+            previous_malheur_output_file = new_output_file[:-33]
+
+            if malheur_running_mode == "increment":
+                log.info("malheur will be running in increment mode")
+
+            try:
+                os.makedirs(reportsdir)
+            except:
+                pass
+
+            mist = mist_convert(results)
+            if mist:
+                with open(os.path.join(reportsdir, task_id + ".txt"), "w") as outfile:
+                    outfile.write(mist)
+            else:
+                # if no mist then no point to run malheur
+                raise CuckooReportError("Failed to extract mist data for task: %s" % task_id)
+
+            try:
+                cmdline = ["malheur", "-c", cfgpath, "-o", new_output_file, malheur_running_mode, reportsdir]
+                run = subprocess.Popen(cmdline, stdout=subprocess.PIPE,
+                                       stdin=subprocess.PIPE,
+                                       stderr=subprocess.PIPE)
+                out, err = run.communicate()
+                for line in err.splitlines():
+                    if line.startswith("Warning: Discarding empty feature vector"):
+                        badfile = line.split("'")[1].split("'")[0]
+                        os.remove(os.path.join(reportsdir, badfile))
+
+                if os.path.exists(new_output_file):
+                    if malheur_running_mode == "increment":
+                        # Append the previous output to the newly created output file so that the new
+                        # result is merged with the earlier ones.
+                        # This is needed for the "similar" tab of an analysis in the Django web UI
+                        with open(new_output_file,'ab') as new_output:
+                            with open(previous_malheur_output_file, 'rb') as previous_output:
+                                for line in previous_output:
+                                    if line.startswith("#"):
+                                        continue
+                                    elif line.startswith(task_id):
+                                        # Skip earlier entries for this task; they exist when reporting has been run more than once for it
+                                        continue
+                                    new_output.write(line)
+                else:
+                    # this will happen?
+                    raise CuckooReportError("Failed to generate output file from malheur execution for task: %s" % task_id)
+
+                # replace previous classification state with new results atomically
+                os.rename(new_output_file, previous_malheur_output_file)
+            except Exception as e:
+                raise CuckooReportError("Failed to perform Malheur classification: %s" % e)
+        finally:
+            # Always release the lock, even on error; otherwise other malheur runs waiting on it would block indefinitely
+            self.lock.release()