Pārlūkot izejas kodu

Create a tool for analysing excel tables

Eren Yilmaz 1 gadu atpakaļ
vecāks
revīzija
cb31a03f4f

+ 185 - 0
time_recoder/statistics.py

@@ -0,0 +1,185 @@
+import os
+from typing import List
+
+import numpy
+import pandas
+from pandas import DataFrame
+
+from tool_lib.tuned_pandasql import TunedPandaSQL
+from tool_lib.util import my_tabulate
+
+
+def load_from_disk():
+    dataframes = []
+    this_dir = os.path.dirname(__file__)
+    for xlsx_path in os.listdir(os.path.join(this_dir, 'time_recorded_tables')):
+        if not xlsx_path.endswith('.xlsx'):
+            continue
+        if xlsx_path.startswith('~$'):
+            continue
+        if 'example' in xlsx_path:
+            continue
+        dataframes.append(pandas.read_excel(os.path.join(this_dir, 'time_recorded_tables', xlsx_path)))
+    result = pandas.concat(dataframes)
+    result['Datum'] = pandas.to_datetime(result['Datum'])
+    result['Kalenderwoche'] = result['Datum'].dt.strftime('%V')
+    result.sort_values(by=['Datum'], inplace=True)
+
+    return result
+
+
+def get_date(row):
+    if pandas.isnull(row['Datum']):
+        return 'NULL'
+    return row['Datum'].strftime('%d.%m.%Y')
+
+
+def get_name(row):
+    return row['Name']
+
+
+def get_year(row):
+    if pandas.isnull(row['Datum']):
+        return 'NULL'
+    return row['Datum'].strftime('%Y')
+
+
+def get_month(row):
+    if pandas.isnull(row['Datum']):
+        return 'NULL'
+    return row['Datum'].strftime('%m.%Y')
+
+
+def get_task(row):
+    return row['Task']
+
+
+def get_work_time(row):
+    return row['Arbeitszeit']
+
+
+def get_work_time_minutes(row):
+    return row['Arbeitszeit in Min']
+
+
+def get_income(row):
+    return row['Einkommen']
+
+
+def get_overtime(row):
+    return row['Überstunden']
+
+
+def raw_df():
+    MAX_TASK_PARTS = 4
+
+    def get_calendar_week(row):
+        return row['Kalenderwoche']
+
+    columns = {
+        'Name': get_name,
+        'Datum': get_date,
+        'Monat': get_month,
+        'Jahr': get_year,
+        'Task': get_task,
+        'Kalenderwoche': get_calendar_week,
+        'Arbeitszeit': get_work_time,
+        'Arbeitszeit in Minuten': get_work_time_minutes,
+        'Einkommen': get_income,
+        'Überstunden': get_overtime,
+        **{
+            f'Task Präfix {i}': lambda row, i=i: get_task_prefix(row, i)
+            for i in range(1, MAX_TASK_PARTS)
+        }
+    }
+    combinations: List[tuple] = [
+        ('Name', 'Jahr'),
+        ('Name', 'Task'),
+        ('Name', 'Task Präfix 1'),
+        ('Name', 'Task Präfix 2'),
+        ('Name', 'Task Präfix 3'),
+    ]
+
+    for combination in combinations:
+        new_column_name = '_by_'.join(combination)
+        columns[new_column_name] = lambda *args, combination=combination, **kwargs: ', '.join(str(columns[column](*args, **kwargs)) for column in combination)
+
+    df = load_from_disk()
+    table = [
+        [columns[column](row) for column in columns]
+        for row in df.to_dict('records')
+    ]
+    return DataFrame(table, columns=list(columns))
+
+
+def get_task_prefix(row, prefix_length):
+    return "/".join(str(row['Task']).split('/')[:prefix_length])
+
+
+def preprocess_for_sql(x):
+    if isinstance(x, int):
+        return str(x)
+    elif isinstance(x, str):
+        x = x.replace("'", r"\'")
+        return f"'{x}'"
+    elif isinstance(x, float):
+        return str(x)
+    elif numpy.isscalar(x):
+        return str(x)
+    elif x is None:
+        return 'NULL'
+    else:
+        raise NotImplementedError(type(x))
+
+
+def first_column_values(values):
+    return list(values[values.columns[0]])
+
+
+def compute_rows(max_distinct_values=120, ):
+    df = raw_df()
+    result = []
+    if df.shape[0] == 0:
+        return result
+    relevant_columns = [column for column in df.columns
+                        if not column.startswith('_')]
+    distinct_column_values = TunedPandaSQL().multiquery(
+        {column: f'SELECT DISTINCT "{column}" FROM df' for column in df.columns},
+        {'df': df[relevant_columns]}
+    )
+    resultss = TunedPandaSQL().multiquery(
+        {
+            (column, v): f'SELECT Arbeitszeit, "Arbeitszeit in Minuten", Einkommen, Überstunden '
+                         f'FROM df WHERE "{column}" IS {v}'
+            for column in relevant_columns
+            for column_value in first_column_values(distinct_column_values[column])
+            if len(distinct_column_values[column]) <= max_distinct_values
+            for v in [preprocess_for_sql(column_value)]
+            if v != 'nan'
+        },
+        {'df': df[relevant_columns]}
+    )
+    for column in relevant_columns:
+        values: DataFrame = distinct_column_values[column]
+
+        if len(values) > max_distinct_values:
+            print(f'{len(values)} distinct values in column {column}')
+
+    for (column, v), results in resultss.items():
+        if column in ['Arbeitszeit', 'Arbeitszeit in Minuten', 'Einkommen', 'Überstunden']:
+            continue
+        new_row = [
+            column,
+            v,
+            results['Arbeitszeit'].sum(),
+            results['Arbeitszeit in Minuten'].sum(),
+            results['Einkommen'].sum(),
+            results['Überstunden'].sum(),
+        ]
+        result.append(new_row)
+    return result, ['Key', 'Value', 'Arbeitszeit', 'Arbeitszeit in Minuten', 'Einkommen', 'Überstunden']
+
+
+rows, columns = compute_rows()
+rows = sorted(rows, key=lambda row: (row[0], row[1]))
+print(my_tabulate(data=rows, headers=columns))

BIN
time_recoder/time_recorded_tables/work_time_eren.xlsx


+ 46 - 0
tool_lib/tuned_pandasql.py

@@ -0,0 +1,46 @@
+from typing import Dict, Any
+
+from pandasql import PandaSQL
+
+
+class TunedPandaSQL(PandaSQL):
+    """PandaSQL variant that can execute several queries on one connection."""
+
+    def multiquery(self, queries: Dict[Any, str], env=None):
+        """
+        Execute several SQL queries on a single connection.
+        Automatically creates tables mentioned in the queries from dataframes before executing.
+
+        :param queries: Dict mapping arbitrary keys to SQL query strings, which can reference pandas dataframes
+        as SQL tables.
+        :param env: Variables environment - a dict mapping table names to pandas dataframes.
+        If not specified, the result of pandasql's ``get_outer_frame_variables()`` is used.
+        :return: Dict mapping every key of ``queries`` to the result of its query,
+        or to None if the query returned nothing.
+        :raises PandaSQLException: If executing a query raises a DatabaseError.
+        """
+        if env is None:
+            from pandasql.sqldf import get_outer_frame_variables
+            env = get_outer_frame_variables()
+
+        from pandasql.sqldf import extract_table_names
+        from pandasql.sqldf import write_table, read_sql, DatabaseError, PandaSQLException, ResourceClosedError
+        with self.conn as conn:
+            # collect the table names of all queries first, so every table is written at most once per call
+            table_names = set(name
+                              for query in queries.values()
+                              for name in extract_table_names(query))
+            for table_name in table_names:
+                if table_name not in env:
+                    # don't raise error because the table may be already in the database
+                    continue
+                if self.persist and table_name in self.loaded_tables:
+                    # table was loaded before using the same instance, don't do it again
+                    continue
+                self.loaded_tables.add(table_name)
+                write_table(env[table_name], table_name, conn)
+            results = {}
+            for k, query in queries.items():
+                try:
+                    results[k] = read_sql(query, conn)
+                except DatabaseError as ex:
+                    raise PandaSQLException(ex)
+                except ResourceClosedError:
+                    # query returns nothing
+                    results[k] = None
+
+        return results

+ 1146 - 0
tool_lib/util.py

@@ -0,0 +1,1146 @@
+import datetime
+import faulthandler
+import functools
+import gc
+import inspect
+import json
+import math
+import os
+import random
+import re
+import sqlite3
+import sys
+import threading
+import time
+from bisect import bisect_left
+from enum import Enum
+from itertools import chain, combinations
+from math import log, isnan, nan, floor, log10, gcd
+from numbers import Number
+from shutil import copyfile
+from threading import RLock
+from types import FunctionType
+from typing import Union, Tuple, List, Optional, Dict, Any, Type
+# noinspection PyUnresolvedReferences
+from unittest import TestCase, mock
+
+import cachetools
+import hanging_threads
+import matplotlib.cm
+import matplotlib.pyplot as plt
+import numpy
+import numpy as np
+import pandas
+import scipy.optimize
+import scipy.stats
+import tabulate
+from scipy.ndimage import zoom
+
+from lib import stack_tracer, print_exc_plus
+from lib.my_logger import logging
+
+# Three aliases of float; presumably used to label coordinate-like values in annotations -- TODO confirm
+X = Y = Z = float
+
+
+class KnownIssue(Exception):
+    """
+    This means the code is not working and should not be used but still too valuable to be deleted
+    """
+    pass
+
+
+def powerset(iterable):
+    """powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"""
+    s = list(iterable)
+    return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))
+
+
+def plot_with_conf(x, y_mean, y_conf, alpha=0.5, **kwargs):
+    ax = kwargs.pop('ax', plt.gca())
+    base_line, = ax.plot(x, y_mean, **kwargs)
+    y_mean = np.array(y_mean)
+    y_conf = np.array(y_conf)
+    lb = y_mean - y_conf
+    ub = y_mean + y_conf
+
+    ax.fill_between(x, lb, ub, facecolor=base_line.get_color(), alpha=alpha)
+
+
+def choice(sequence, probabilities):
+    # if sum(probabilities) != 1:
+    #     raise AssertionError('Probabilities must sum to 1')
+    r = random.random()
+    for idx, c in enumerate(sequence):
+        r -= probabilities[idx]
+        if r < 0:
+            return c
+    raise AssertionError('Probabilities must sum to 1')
+
+
+def print_attributes(obj, include_methods=False, ignore=None):
+    if ignore is None:
+        ignore = []
+    for attr in dir(obj):
+        if attr in ignore:
+            continue
+        if attr.startswith('_'):
+            continue
+        if not include_methods and callable(obj.__getattr__(attr)):
+            continue
+        print(attr, ':', obj.__getattr__(attr).__class__.__name__, ':', obj.__getattr__(attr))
+
+
+def attr_dir(obj, include_methods=False, ignore=None):
+    if ignore is None:
+        ignore = []
+    return {attr: obj.__getattr__(attr)
+            for attr in dir(obj)
+            if not attr.startswith('_') and (
+                include_methods or not callable(obj.__getattr__(attr))) and attr not in ignore}
+
+
+def zoom_to_shape(a: np.ndarray, shape: Tuple, mode: str = 'smooth', verbose=1):
+    from keras import backend
+    a = np.array(a, dtype=backend.floatx())  # also does a copy
+    shape_dim = len(a.shape)
+    if len(a.shape) != len(shape):
+        raise ValueError('The shapes must have the same dimension but were len({0}) = {1} (original) '
+                         'and len({2}) = {3} desired.'.format(a.shape, len(a.shape), shape, len(shape)))
+    if len(shape) == 0:
+        return a
+    zoom_factors = tuple(shape[idx] / a.shape[idx] for idx in range(shape_dim))
+
+    def _current_index_in_old_array():
+        return tuple(slice(0, length) if axis != current_axis else slice(current_pixel_index, current_pixel_index + 1)
+                     for axis, length in enumerate(a.shape))
+
+    def _current_pixel_shape():
+        return tuple(length if axis != current_axis else 1
+                     for axis, length in enumerate(a.shape))
+
+    def _current_result_index():
+        return tuple(
+            slice(0, length) if axis != current_axis else slice(pixel_index_in_result, pixel_index_in_result + 1)
+            for axis, length in enumerate(a.shape))
+
+    def _current_result_shape():
+        return tuple(orig_length if axis != current_axis else shape[axis]
+                     for axis, orig_length in enumerate(a.shape))
+
+    if mode == 'constant':
+        result = zoom(a, zoom_factors)
+        assert result.shape == shape
+        return result
+    elif mode == 'smooth':
+        result = a
+        for current_axis, zoom_factor in sorted(enumerate(zoom_factors), key=lambda x: x[1]):
+            result = np.zeros(_current_result_shape(), dtype=backend.floatx())
+            # current_length = a.shape[current_axis]
+            desired_length = shape[current_axis]
+            current_pixel_index = 0
+            current_pixel_part = 0  # how much of the current pixel is already read
+            for pixel_index_in_result in range(desired_length):
+                pixels_remaining = 1 / zoom_factor
+                pixel_sum = np.zeros(_current_pixel_shape())
+                while pixels_remaining + current_pixel_part > 1:
+                    pixel_sum += (1 - current_pixel_part) * a[_current_index_in_old_array()]
+                    current_pixel_index += 1
+                    pixels_remaining -= (1 - current_pixel_part)
+                    current_pixel_part = 0
+
+                # the remaining pixel_part
+                try:
+                    pixel_sum += pixels_remaining * a[_current_index_in_old_array()]
+                except (IndexError, ValueError):
+                    if verbose:
+                        print('WARNING: Skipping {0} pixels because of numerical imprecision.'.format(pixels_remaining))
+                else:
+                    current_pixel_part += pixels_remaining
+
+                # insert to result
+                pixel_sum *= zoom_factor
+
+                result[_current_result_index()] = pixel_sum
+            a = result
+
+        assert result.shape == shape
+        return result
+    else:
+        return NotImplementedError('Mode not available.')
+
+
+def profile_wall_time_instead_if_profiling():
+    try:
+        import yappi
+    except ModuleNotFoundError:
+        return
+    currently_profiling = len(yappi.get_func_stats())
+    if currently_profiling and yappi.get_clock_type() != 'wall':
+        yappi.stop()
+        print('Profiling wall time instead of cpu time.')
+        yappi.clear_stats()
+        yappi.set_clock_type("wall")
+        yappi.start()
+
+
+def dummy_computation(*_args, **_kwargs):
+    pass
+
+
+def backup_file(filename):
+    copyfile(filename, backup_file_path(filename))
+
+
+def backup_file_path(filename):
+    return filename + time.strftime("%Y%m%d") + '.bak'
+
+
+# noinspection SpellCheckingInspection
+def my_tabulate(data, tablefmt='pipe', **params):
+    if data == [] and 'headers' in params:
+        data = [(None for _ in params['headers'])]
+    tabulate.MIN_PADDING = 0
+    return tabulate.tabulate(data, tablefmt=tablefmt, **params)
+
+
+def ce_loss(y_true, y_predicted):
+    return -(y_true * log(y_predicted) + (1 - y_true) * log(1 - y_predicted))
+
+
+class DontSaveResultsError(Exception):
+    pass
+
+
+def multinomial(n, bins):
+    if bins == 0:
+        if n > 0:
+            raise ValueError('Cannot distribute to 0 bins.')
+        return []
+    remaining = n
+    results = []
+    for i in range(bins - 1):
+        from numpy.random.mtrand import binomial
+        x = binomial(remaining, 1 / (bins - i))
+        results.append(x)
+        remaining -= x
+
+    results.append(remaining)
+    return results
+
+
+class UnknownTypeError(Exception):
+    pass
+
+
+# def shape_analysis(xs):
+#     composed_dtypes = [list, tuple, np.ndarray, dict, set]
+#     base_dtypes = [str, int, float, type, object]  # TODO add class and superclass of xs first element
+#     all_dtypes = composed_dtypes + base_dtypes
+#     if isinstance(xs, np.ndarray):
+#         outer_brackets = ('[', ']')
+#         shape = xs.shape
+#         dtype = xs.dtype
+#     elif isinstance(xs, tuple):
+#         outer_brackets = ('(', ')')
+#         shape = len(xs)
+#         dtype = [t for t in all_dtypes if all(isinstance(x, t) for x in xs)][0]
+#     elif isinstance(xs, list):
+#         outer_brackets = ('[', ']')
+#         shape = len(xs)
+#         dtype = [t for t in all_dtypes if all(isinstance(x, t) for x in xs)][0]
+#     elif isinstance(xs, dict) or isinstance(xs, set):
+#         outer_brackets = ('{', '}')
+#         shape = len(xs)
+#         dtype = [t for t in all_dtypes if all(isinstance(x, t) for x in xs)][0]
+#     elif any(isinstance(xs, t) for t in base_dtypes):
+#         for t in base_dtypes:
+#             if isinstance(xs, t):
+#                 return str(t.__name__)
+#         raise AssertionError('This should be unreachable.')
+#     else:
+#         raise UnknownTypeError('Unknown type:' + type(xs).__name__)
+#
+#     if shape and shape != '?':
+#         return outer_brackets[0] + str(xs.shape) + ' * ' + str(dtype) + outer_brackets[1]
+#     else:
+#         return outer_brackets[0] + outer_brackets[1]
+
+
+def beta_conf_interval_mle(data, conf=0.95):
+    if len(data) <= 1:
+        return 0, 1  # overestimates the interval
+    if any(d < 0 or d > 1 or isnan(d) for d in data):
+        return nan, nan
+    if numpy.var(data) == 0:
+        return numpy.mean(data), numpy.mean(data)
+    epsilon = 1e-3
+    # adjusted_data = data.copy()
+    # for idx in range(len(adjusted_data)):
+    #     adjusted_data[idx] *= (1 - 2 * epsilon)
+    #     adjusted_data[idx] += epsilon
+    alpha, beta, _, _ = scipy.stats.beta.fit(data, floc=-epsilon, fscale=1 + 2 * epsilon)
+
+    lower, upper = scipy.stats.beta.interval(alpha=conf, a=alpha, b=beta)
+    if lower < 0:
+        lower = 0
+    if upper < 0:
+        upper = 0
+    if lower > 1:
+        lower = 1
+    if upper > 1:
+        upper = 1
+    return lower, upper
+
+
+def gamma_conf_interval_mle(data, conf=0.95) -> Tuple[float, float]:
+    if len(data) == 0:
+        return nan, nan
+    if len(data) == 1:
+        return nan, nan
+    if any(d < 0 or isnan(d) for d in data):
+        return nan, nan
+    if numpy.var(data) == 0:
+        return numpy.mean(data).item(), 0
+    alpha, _, scale = scipy.stats.gamma.fit(data, floc=0)
+
+    lower, upper = scipy.stats.gamma.interval(alpha=conf, a=alpha, scale=scale)
+    if lower < 0:
+        lower = 0
+    if upper < 0:
+        upper = 0
+    return lower, upper
+
+
+# holds the results of the 10 most recently used argument combinations of beta_parameters_quantiles
+beta_quantile_cache = cachetools.LRUCache(maxsize=10)
+
+
+# the cache key ignores ``guess``: a cached result is reused even if the initial guess differs
+@cachetools.cached(cache=beta_quantile_cache, key=lambda x1, p1, x2, p2, guess: (x1, x2, p1, p2))
+def beta_parameters_quantiles(x1, p1, x2, p2, guess=(3, 3)):
+    """
+    Find parameters (a, b) of a beta random variable X so that P(X <= x1) = p1 and P(X <= x2) = p2.
+
+    The parameters are found numerically by minimizing the squared differences between the beta CDF
+    at ``x1`` / ``x2`` and ``p1`` / ``p2`` with ``scipy.optimize.fmin``, starting at ``guess``.
+
+    :return: Tuple ``(a, b)`` with the parameters found by the optimizer.
+    """
+
+    def square(x):
+        return x * x
+
+    def objective(v):
+        (a, b) = v
+        temp = square(scipy.stats.beta.cdf(x1, a, b) - p1)
+        temp += square(scipy.stats.beta.cdf(x2, a, b) - p2)
+        return temp
+
+    xopt = scipy.optimize.fmin(objective, guess, disp=False)
+    return (xopt[0], xopt[1])
+
+
+def beta_conf_interval_quantile(data, conf=0.95, quantiles=(0.25, 0.75)):
+    if len(data) <= 1:
+        return 0, 1  # overestimates the interval
+    mu = numpy.mean(data)
+    v = numpy.var(data)
+    data = numpy.array(data)
+    if v == 0:
+        return mu, mu
+    lower = numpy.quantile(data, quantiles[0])
+    upper = numpy.quantile(data, quantiles[1])
+
+    alpha_guess = mu ** 2 * ((1 - mu) / v - 1 / mu)
+    beta_guess = alpha_guess * (1 / mu - 1)
+
+    alpha, beta = beta_parameters_quantiles(lower, quantiles[0], upper, quantiles[1], (alpha_guess, beta_guess))
+    return scipy.stats.beta.interval(alpha=conf, a=alpha, b=beta)
+
+
+def beta_stats_quantile(data, quantiles=(0.25, 0.75)):
+    if len(data) <= 1:
+        return 0, 1  # overestimates the interval
+    data = numpy.array(data)
+    mu = numpy.mean(data)
+    v = numpy.var(data)
+    if v == 0:
+        return mu, mu
+    lower = numpy.quantile(data, quantiles[0])
+    upper = numpy.quantile(data, quantiles[1])
+
+    alpha_guess = mu ** 2 * ((1 - mu) / v - 1 / mu)
+    beta_guess = alpha_guess * (1 / mu - 1)
+
+    alpha, beta = beta_parameters_quantiles(lower, quantiles[0], upper, quantiles[1], (alpha_guess, beta_guess))
+    return scipy.stats.beta.stats(a=alpha, b=beta)
+
+
+def beta_stats_mle(data):
+    if len(data) == 0:
+        return nan, nan
+    if len(data) == 1:
+        return nan, nan
+    if any(d < 0 or d > 1 or isnan(d) for d in data):
+        return nan, nan
+    if numpy.var(data) == 0:
+        return numpy.mean(data), 0
+    epsilon = 1e-4
+    # adjusted_data = data.copy()
+    # for idx in range(len(adjusted_data)):
+    #     adjusted_data[idx] *= (1 - 2 * epsilon)
+    #     adjusted_data[idx] += epsilon
+    alpha, beta, _, _ = scipy.stats.beta.fit(data, floc=-epsilon, fscale=1 + 2 * epsilon)
+
+    return scipy.stats.beta.stats(a=alpha, b=beta)
+
+
+def gamma_stats_mle(data):
+    if len(data) == 0:
+        return nan, nan
+    if len(data) == 1:
+        return nan, nan
+    if any(d < 0 or isnan(d) for d in data):
+        return nan, nan
+    if numpy.var(data) == 0:
+        return numpy.mean(data), 0
+    alpha, _, scale = scipy.stats.gamma.fit(data, floc=0)
+
+    return scipy.stats.gamma.stats(a=alpha, scale=scale)
+
+
+# Default estimators: the beta functions use the quantile-matching variants,
+# the gamma functions use the maximum likelihood variants.
+beta_stats = beta_stats_quantile
+beta_conf_interval = beta_conf_interval_quantile
+gamma_stats = gamma_stats_mle
+gamma_conf_interval = gamma_conf_interval_mle
+
+
+def split_df_list(df, target_column):
+    """
+    df = data frame to split,
+    target_column = the column containing the values to split
+    separator = the symbol used to perform the split
+    returns: a data frame with each entry for the target column separated, with each element moved into a new row.
+    The values in the other columns are duplicated across the newly divided rows.
+
+    SOURCE: https://gist.github.com/jlln/338b4b0b55bd6984f883
+    """
+
+    def split_list_to_rows(row, row_accumulator):
+        split_row = json.loads(row[target_column])
+        for s in split_row:
+            new_row = row.to_dict()
+            new_row[target_column] = s
+            row_accumulator.append(new_row)
+
+    new_rows = []
+    df.apply(split_list_to_rows, axis=1, args=(new_rows,))
+    new_df = pandas.DataFrame(new_rows)
+    return new_df
+
+
+try:
+    import winsound as win_sound
+
+
+    def beep(*args, **kwargs):
+        win_sound.Beep(*args, **kwargs)
+except ImportError:
+    win_sound = None
+
+
+    def beep(*_args, **_kwargs):
+        pass
+
+
+def round_to_digits(x, d):
+    if x == 0:
+        return 0
+    if isnan(x):
+        return nan
+    try:
+        return round(x, d - 1 - int(floor(log10(abs(x)))))
+    except OverflowError:
+        return x
+
+
+def gc_if_memory_error(f, *args, **kwargs):
+    try:
+        return f(*args, **kwargs)
+    except MemoryError:
+        print('Starting garbage collector')
+        gc.collect()
+        return f(*args, **kwargs)
+
+
+def assert_not_empty(x):
+    assert len(x)
+    return x
+
+
+def validation_steps(validation_dataset_size, maximum_batch_size):
+    batch_size = gcd(validation_dataset_size, maximum_batch_size)
+    steps = validation_dataset_size // batch_size
+    assert batch_size * steps == validation_dataset_size
+    return batch_size, steps
+
+
+def functional_dependency_trigger(connection: sqlite3.Connection,
+                                  table_name: str,
+                                  determining_columns: List[str],
+                                  determined_columns: List[str],
+                                  exist_ok: bool, ):
+    cursor = connection.cursor()
+    # possible_performance_improvements
+    determined_columns = [c for c in determined_columns if c not in determining_columns]
+    trigger_base_name = '_'.join([table_name] + determining_columns + ['determine'] + determined_columns)
+
+    error_message = ','.join(determining_columns) + ' must uniquely identify ' + ','.join(determined_columns)
+
+    # when inserting check if there is already an entry with these values
+    cursor.execute(f'''
+    CREATE TRIGGER {'IF NOT EXISTS' if exist_ok else ''} {trigger_base_name}_after_insert
+    BEFORE INSERT ON {table_name}
+    WHEN EXISTS(SELECT * FROM {table_name}
+         WHERE ({' AND '.join(f'NEW.{c} IS NOT NULL AND {c} = NEW.{c}' for c in determining_columns)})
+         AND ({' OR '.join(f'{c} != NEW.{c}' for c in determined_columns)}))
+    BEGIN SELECT RAISE(ROLLBACK, '{error_message}'); END
+    ''')
+
+    # when updating check if there is already an entry with these values (only if changed)
+    cursor.execute(f'''
+    CREATE TRIGGER {'IF NOT EXISTS' if exist_ok else ''} {trigger_base_name}_after_update
+    BEFORE UPDATE ON {table_name}
+    WHEN EXISTS(SELECT * FROM {table_name}
+         WHERE ({' AND '.join(f'NEW.{c} IS NOT NULL AND {c} = NEW.{c}' for c in determining_columns)})
+         AND ({' OR '.join(f'{c} != NEW.{c}' for c in determined_columns)}))
+    BEGIN SELECT RAISE(ROLLBACK, '{error_message}'); END
+    ''')
+
+
+def heatmap_from_points(x, y,
+                        x_lim: Optional[Union[int, Tuple[int, int]]] = None,
+                        y_lim: Optional[Union[int, Tuple[int, int]]] = None,
+                        gridsize=30):
+    if isinstance(x_lim, Number):
+        x_lim = (x_lim, x_lim)
+    if isinstance(y_lim, Number):
+        y_lim = (y_lim, y_lim)
+
+    plt.hexbin(x, y, gridsize=gridsize, cmap=matplotlib.cm.jet, bins=None)
+    if x_lim is not None:
+        plt.xlim(x_lim)
+    if y_lim is not None:
+        plt.ylim(y_lim)
+
+    cb = plt.colorbar()
+    cb.set_label('mean value')
+
+
+def strptime(date_string, fmt):
+    return datetime.datetime(*(time.strptime(date_string, fmt)[0:6]))
+
+
+class PrintLineRLock(RLock().__class__):
+    def __init__(self, *args, name='', **kwargs):
+        # noinspection PyArgumentList
+        super().__init__(*args, **kwargs)
+        self.name = name
+
+    def acquire(self, blocking: bool = True, timeout: float = -1) -> bool:
+        print(f'Trying to acquire Lock {self.name}')
+        result = RLock.acquire(self, blocking, timeout)
+        print(f'Acquired Lock {self.name}')
+        return result
+
+    def release(self) -> None:
+        print(f'Trying to release Lock {self.name}')
+        # noinspection PyNoneFunctionAssignment
+        result = RLock.release(self)
+        print(f'Released Lock {self.name}')
+        return result
+
+    def __enter__(self, *args, **kwargs):
+        print('Trying to enter Lock')
+        # noinspection PyArgumentList
+        super().__enter__(*args, **kwargs)
+        print('Entered Lock')
+
+    def __exit__(self, *args, **kwargs):
+        print('Trying to exit Lock')
+        super().__exit__(*args, **kwargs)
+        print('Exited Lock')
+
+
+def fixed_get_current_frames():
+    """Return current threads prepared for
+    further processing.
+    """
+    threads = {thread.ident: thread for thread in threading.enumerate()}
+    return {
+        thread_id: {
+            'frame': hanging_threads.thread2list(frame),
+            'time': None,
+            'id': thread_id,
+            'name': threads[thread_id].name,
+            'object': threads[thread_id]
+        } for thread_id, frame in sys._current_frames().items()
+        if thread_id in threads  # otherwise keyerrors might happen because of race conditions
+    }
+
+
+# replace the implementation of the hanging_threads package with the variant defined above
+hanging_threads.get_current_frames = fixed_get_current_frames
+
+
+class CallCounter():
+    def __init__(self, f):
+        self.f = f
+        self.calls = 0
+        self.__name__ = f.__name__
+
+    def __call__(self, *args, **kwargs):
+        self.calls += 1
+        return self.f(*args, **kwargs)
+
+    def __str__(self):
+        return str(self.__dict__)
+
+    def __repr__(self):
+        return self.__class__.__name__ + repr(self.__dict__)
+
+
+def test_with_timeout(timeout=2):
+    def wrapper(f):
+        from lib.threading_timer_decorator import exit_after
+        f = exit_after(timeout)(f)
+
+        @functools.wraps(f)
+        def wrapped(*args, **kwargs):
+            try:
+                print(f'Running this test with timeout: {timeout}')
+                return f(*args, **kwargs)
+            except KeyboardInterrupt:
+                raise AssertionError(f'Test took longer than {timeout} seconds')
+
+        return wrapped
+
+    return wrapper
+
+
+def lru_cache_by_id(maxsize):
+    return cachetools.cached(cachetools.LRUCache(maxsize=maxsize), key=id)
+
+
+class EquivalenceRelation:
+    """
+    Base class for equivalence relations. Subclasses implement ``equivalent``;
+    the other methods are built on top of it.
+    """
+
+    def equivalent(self, a, b) -> bool:
+        """Return whether ``a`` and ``b`` are equivalent. Must be overridden."""
+        raise NotImplementedError('Abstract method')
+
+    def equivalence_classes(self, xs: list):
+        """
+        Partition ``xs`` into lists of equivalent elements.
+
+        Every element is only compared with the first element of each class found so far
+        and is added to the first class that matches.
+        """
+        classes = []
+        for x in xs:
+            for c in classes:
+                if self.equivalent(x, c[0]):
+                    c.append(x)
+                    break
+            else:
+                # no existing class matched, so x starts a new one
+                classes.append([x])
+        return classes
+
+    def check_reflexivity_on_dataset(self, xs):
+        """Return whether every element of ``xs`` is equivalent to itself."""
+        for x in xs:
+            if not self.equivalent(x, x):
+                return False
+        return True
+
+    def check_symmetry_on_dataset(self, xs):
+        """Return whether ``equivalent(x, y)`` implies ``equivalent(y, x)`` for all pairs of distinct objects in ``xs``."""
+        for x in xs:
+            for y in xs:
+                if x is y:
+                    continue
+                if self.equivalent(x, y) and not self.equivalent(y, x):
+                    return False
+        return True
+
+    def check_axioms_on_dataset(self, xs):
+        """Return whether reflexivity, symmetry and transitivity hold on ``xs``."""
+        return (
+            self.check_reflexivity_on_dataset(xs)
+            and self.check_symmetry_on_dataset(xs)
+            # only evaluated if the two checks above passed, so their results may be assumed here
+            and self.check_transitivity_on_dataset(xs, assume_symmetry=True, assume_reflexivity=True)
+        )
+
+    def check_transitivity_on_dataset(self, xs, assume_symmetry=False, assume_reflexivity=False):
+        """
+        Return whether ``equivalent(x, y)`` and ``equivalent(y, z)`` imply ``equivalent(x, z)`` on ``xs``.
+
+        :param assume_symmetry: Skip triples in which ``x`` comes after ``z`` in ``xs``.
+        :param assume_reflexivity: Skip triples in which ``x`` and ``z`` are the same object.
+        """
+        for x_idx, x in enumerate(xs):
+            for y_idx, y in enumerate(xs):
+                if x is y:
+                    continue
+                if self.equivalent(x, y):
+                    for z_idx, z in enumerate(xs):
+                        if y is z:
+                            continue
+                        if assume_symmetry and x_idx > z_idx:
+                            continue
+                        if assume_reflexivity and x is z:
+                            continue
+                        if self.equivalent(y, z):
+                            if not self.equivalent(x, z):
+                                return False
+        return True
+
+    def match_lists(self, xs, ys, filter_minimum_size=0, filter_maximum_size=math.inf):
+        """
+        Group the elements of both lists into equivalence classes.
+
+        :param filter_minimum_size: Classes with fewer elements are left out.
+        :param filter_maximum_size: Classes with more elements are left out.
+        :return: List of classes in reverse order of their creation. Each class is a list of tuples
+            ``(list_index, element)`` where ``list_index`` is 0 for elements of ``xs`` and 1 for elements of ``ys``.
+        :raises ValueError: If an object is contained in both lists.
+        """
+        xs = list(xs)
+        ys = list(ys)
+        if any(x is y for x in xs for y in ys):
+            raise ValueError('Lists contain the same element. This is currently not supported.')
+        classes = self.equivalence_classes([*xs, *ys])
+
+        return [
+            [
+                # the origin of an element is determined by object identity
+                (0 if any(x2 is x for x2 in xs) else 1, x)
+                for x in c
+            ]
+            for c in classes[::-1]
+            if filter_minimum_size <= len(c) <= filter_maximum_size
+        ]
+
+
+def iff_patch(patch: mock._patch):
+    def decorator(f):
+        def wrapped(*args, **kwargs):
+            with patch:
+                f(*args, **kwargs)
+            try:
+                f(*args, **kwargs)
+            except:
+                pass
+            else:
+                raise AssertionError('Test did not fail without patch')
+
+        return wrapped
+
+    return decorator
+
+
+def iff_not_patch(patch: mock._patch):
+    def decorator(f):
+        def wrapped(*args, **kwargs):
+            f(*args, **kwargs)
+            try:
+                with patch:
+                    f(*args, **kwargs)
+            except Exception as e:
+                pass
+            else:
+                raise AssertionError('Test did not fail with patch')
+
+        return wrapped
+
+    return decorator
+
+
+# recipients that main_wrapper sends a crash report mail to
+EMAIL_CRASHES_TO = []
+# pairs of (to_number, from_number) that main_wrapper starts a voice call for after a crash
+VOICE_CALL_ON_CRASH: List[Tuple[str, str]] = []
+
+
+def list_logger(base_logging_function, store_in_list: list):
+    def print_and_store(*args, **kwargs):
+        base_logging_function(*args, **kwargs)
+        store_in_list.extend(args)
+
+    return print_and_store
+
+
+def main_wrapper(f):
+    @functools.wraps(f)
+    def wrapper(*args, **kwargs):
+        start = time.perf_counter()
+        # import lib.stack_tracer
+        import __main__
+        # does not help much
+        # monitoring_thread = hanging_threads.start_monitoring(seconds_frozen=180, test_interval=1000)
+        os.makedirs('logs', exist_ok=True)
+        stack_tracer.trace_start('logs/' + os.path.split(__main__.__file__)[-1] + '.html', interval=5)
+        faulthandler.enable()
+        profile_wall_time_instead_if_profiling()
+
+        # noinspection PyBroadException
+        try:
+            return f(*args, **kwargs)
+        except KeyboardInterrupt:
+            error_messages = []
+            print_exc_plus.print_exc_plus(print=list_logger(logging.error, error_messages),
+                                          serialize_to='logs/' + os.path.split(__main__.__file__)[-1] + '.dill')
+        except:
+            error_messages = []
+            print_exc_plus.print_exc_plus(print=list_logger(logging.error, error_messages),
+                                          serialize_to='logs/' + os.path.split(__main__.__file__)[-1] + '.dill')
+            for recipient in EMAIL_CRASHES_TO:
+                from jobs.sending_emails import send_mail
+                send_mail.create_simple_mail_via_gmail(body='\n'.join(error_messages), filepath=None, excel_name=None, to_mail=recipient, subject='[python] Crash report')
+            for to_number, from_number in VOICE_CALL_ON_CRASH:
+                logging.info(f'Calling {from_number} to notify about the crash.')
+                voice_call('This is a notification message that one of your python scripts has crashed. If you are unsure about the origin of this call, please contact Eren Yilmaz.',
+                           to_number, from_number)
+        finally:
+            logging.info('Terminated.')
+            total_time = time.perf_counter() - start
+            faulthandler.disable()
+            stack_tracer.trace_stop()
+            frequency = 2000
+            duration = 500
+            beep(frequency, duration)
+            print('Total time', total_time)
+            try:
+                from algorithm_development.metatrader import ZeroMQ_Connector
+                ZeroMQ_Connector.DWX_ZeroMQ_Connector.deactivate_all()
+            except ImportError:
+                pass
+
+    return wrapper
+
+
+def voice_call(msg, to_number, from_number):
+    from twilio.rest import Client
+    account_sid = 'AC63c459168c3e4fe34e462acb4f44f748'
+    auth_token = 'b633bc0e945fe7cb737fdac395cc71d6'
+    client = Client(account_sid, auth_token)
+
+    call = client.calls.create(
+                            twiml=f'<Response><Say>{msg}</Say></Response>',
+                            from_=from_number,
+                            to=to_number,
+                        )
+
+    print(call.sid)
+
+
+
+def required_size_for_safe_rotation(base: Tuple[X, Y, Z], rotate_range_deg) -> Tuple[X, Y, Z]:
+    if abs(rotate_range_deg) > 45:
+        raise NotImplementedError
+    if abs(rotate_range_deg) > 0:
+        x_length = base[2] * math.sin(rotate_range_deg / 180 * math.pi) + base[1] * math.cos(
+            rotate_range_deg / 180 * math.pi)
+        y_length = base[2] * math.cos(rotate_range_deg / 180 * math.pi) + base[1] * math.sin(
+            rotate_range_deg / 180 * math.pi)
+        result = (base[0],
+                  x_length,
+                  y_length,)
+    else:
+        result = base
+    return result
+
+
+def round_to_closest_value(x, values, assume_sorted=False):
+    if not assume_sorted:
+        values = sorted(values)
+    next_largest = bisect_left(values, x)  # binary search
+    if next_largest == 0:
+        return values[0]
+    if next_largest == len(values):
+        return values[-1]
+    next_smallest = next_largest - 1
+    smaller = values[next_smallest]
+    larger = values[next_largest]
+    if abs(smaller - x) < abs(larger - x):
+        return smaller
+    else:
+        return larger
+
+
+def binary_search(a, x, lo=0, hi=None):
+    hi = hi if hi is not None else len(a)  # hi defaults to len(a)
+
+    pos = bisect_left(a, x, lo, hi)  # find insertion position
+
+    return pos if pos != hi and a[pos] == x else -1  # don't walk off the end
+
+
+def ceil_to_closest_value(x, values):
+    values = sorted(values)
+    next_largest = bisect_left(values, x)  # binary search
+    if next_largest < len(values):
+        return values[next_largest]
+    else:
+        return values[-1]  # if there is no larger value use the largest one
+
+
+def print_progress_bar(iteration, total, prefix='Progress:', suffix='', decimals=1, length=50, fill='█',
+                       print_eta=True):
+    """
+    Call in a loop to create terminal progress bar
+    @params:
+        iteration   - Required  : current iteration (Int)
+        total       - Required  : total iterations (Int)
+        prefix      - Optional  : prefix string (Str)
+        suffix      - Optional  : suffix string (Str)
+        decimals    - Optional  : positive number of decimals in percent complete (Int)
+        length      - Optional  : character length of bar (Int)
+        fill        - Optional  : bar fill character (Str)
+    """
+    percent = ("{0:" + str(4 + decimals) + "." + str(decimals) + "f}").format(100 * (iteration / float(total)))
+    filled_length = int(length * iteration // total)
+    bar = fill * filled_length + '-' * (length - filled_length)
+    if getattr(print_progress_bar, 'last_printed_value', None) == (prefix, bar, percent, suffix):
+        return
+    print_progress_bar.last_printed_value = (prefix, bar, percent, suffix)
+    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='')
+    # Print New Line on Complete
+    if iteration == total:
+        print()
+
+
+def get_all_subclasses(klass):
+    all_subclasses = []
+
+    for subclass in klass.__subclasses__():
+        all_subclasses.append(subclass)
+        all_subclasses.extend(get_all_subclasses(subclass))
+
+    return all_subclasses
+
+
+def my_mac_address():
+    """
+    https://stackoverflow.com/a/160821
+    """
+    import uuid
+    mac = uuid.getnode()
+    if (mac >> 40) % 2:
+        return None
+    mac = uuid.UUID(int=mac).hex[-12:]
+    return mac
+
+
+
+def latin1_json(data):
+    return json.dumps(data, ensure_ascii=False).encode('latin-1')
+
+
+def l2_norm(v1, v2):
+    if len(v1) != len(v2):
+        raise ValueError('Both vectors must be of the same size')
+    return math.sqrt(sum([(x1 - x2) * (x1 - x2) for x1, x2 in zip(v1, v2)]))
+
+
+def allow_additional_unused_keyword_arguments(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        allowed_kwargs = [param.name for param in inspect.signature(func).parameters.values()]
+        allowed_kwargs = {a: kwargs[a] for a in kwargs if a in allowed_kwargs}
+        return func(*args, **allowed_kwargs)
+
+    return wrapper
+
+
+def copy_and_rename_method(func, new_name):
+    funcdetails = [
+        func.__code__,
+        func.__globals__,
+        func.__name__,
+        func.__defaults__,
+        func.__closure__
+    ]
+    old_name = func.__name__
+    # copy
+    # new_func = dill.loads(dill.dumps(func))
+    new_func = FunctionType(*funcdetails)
+    assert new_func is not funcdetails
+    # rename
+    new_func.__name__ = new_name
+    assert func.__name__ is old_name
+    return new_func
+
+
+def rename(new_name):
+    def decorator(f):
+        f.__name__ = new_name
+        return f
+
+    return decorator
+
+
+class LogicError(Exception):
+    pass
+
+
+def round_time(dt=None, precision=60):
+    """Round a datetime object to any time lapse in seconds
+    dt : datetime.datetime object, default now.
+    roundTo : Closest number of seconds to round to, default 1 minute.
+    Author: Thierry Husson 2012 - Use it as you want but don't blame me.
+    """
+    if dt is None:
+        dt = datetime.datetime.now()
+    if isinstance(precision, datetime.timedelta):
+        precision = precision.total_seconds()
+    seconds = (dt.replace(tzinfo=None) - dt.min).seconds
+    rounding = (seconds + precision / 2) // precision * precision
+    return dt + datetime.timedelta(seconds=rounding - seconds,
+                                   microseconds=dt.microsecond)
+
+
+def chunks(lst, n):
+    """Yield successive n-sized chunks from lst."""
+    for i in range(0, len(lst), n):
+        yield lst[i:i + n]
+
+
+def shorten_name(name):
+    name = re.sub(r'\s+', r' ', str(name))
+    name = name.replace(', ', ',')
+    name = name.replace(', ', ',')
+    name = name.replace(' ', '_')
+    return re.sub(r'([A-Za-z])[a-z]*_?', r'\1', str(name))
+
+
+def array_analysis(a: numpy.ndarray):
+    print(f'  Shape: {a.shape}')
+    mean = a.mean()
+    print(f'  Mean: {mean}')
+    print(f'  Std: {a.std()}')
+    print(f'  Min, Max: {a.min(), a.max()}')
+    print(f'  Mean absolute: {numpy.abs(a).mean()}')
+    print(f'  Mean square: {numpy.square(a).mean()}')
+    print(f'  Mean absolute difference from mean: {numpy.abs(a - mean).mean()}')
+    print(f'  Mean squared difference from mean: {numpy.square(a - mean).mean()}')
+    nonzero = numpy.count_nonzero(a)
+    print(f'  Number of non-zeros: {nonzero}')
+    print(f'  Number of zeros: {numpy.prod(a.shape) - nonzero}')
+    if a.shape[-1] > 1 and a.shape[-1] <= 1000:
+        # last dim is probably the number of classes
+        print(f'  Class counts: {numpy.count_nonzero(a, axis=tuple(range(len(a.shape) - 1)))}')
+
+
+def current_year_begin():
+    return datetime.datetime(datetime.datetime.today().year, 1, 1).timestamp()
+
+
+def current_day_begin():
+    return datetime.datetime.today().timestamp() // (3600 * 24) * (3600 * 24)
+
+
+def current_second_begin():
+    return floor(datetime.datetime.today().timestamp())
+
+
+def running_workers(executor):
+    print(next(iter(executor._threads)).__dict__)
+    return sum(1 for t in executor._threads
+               if t == 1)
+
+
+def queued_calls(executor):
+    return len(executor._work_queue.queue)
+
+
+def retry_on_error(max_tries=3, delay=0.5, backoff=2, only_error_classes=Exception):
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            for i in range(max_tries):
+                try:
+                    return func(*args, **kwargs)
+                except only_error_classes as e:
+                    if i == max_tries - 1:
+                        raise
+                    logging.error(f'Re-try after error in {func.__name__}: {type(e).__name__}, {e}')
+                    time.sleep(delay * (backoff ** i))
+        return wrapper
+    return decorator
+
+
+
+class EBC:
+    SUBCLASSES_BY_NAME: Dict[str, Type['EBC']] = {}
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+        EBC.SUBCLASSES_BY_NAME[cls.__name__] = cls
+
+    def __eq__(self, other):
+        return isinstance(other, type(self)) and self.__dict__ == other.__dict__
+
+    def __str__(self):
+        return str(self.__dict__)
+
+    def __repr__(self):
+        return f'{type(self).__name__}(**' + str(self.__dict__) + ')'
+
+    def to_json(self) -> Dict[str, Any]:
+        result: Dict[str, Any] = {
+            'type': type(self).__name__,
+            **self.__dict__,
+        }
+        for k in result:
+            if isinstance(result[k], EBC):
+                result[k] = result[k].to_json()
+            elif isinstance(result[k], numpy.ndarray):
+                result[k] = result[k].tolist()
+            elif isinstance(result[k], list):
+                result[k] = [r.to_json() if isinstance(r, EBC) else r
+                             for r in result[k]]
+        return result
+
+    @staticmethod
+    def from_json(data: Dict[str, Any]):
+        cls = EBC.SUBCLASSES_BY_NAME[data['type']]
+        return class_from_json(cls, data)
+
+
+def class_from_json(cls, data: Dict[str, Any]):
+    if isinstance(data, str):
+        data = json.loads(data)
+    # noinspection PyArgumentList
+    try:
+        return cls(**data)
+    except TypeError as e:
+        if "__init__() got an unexpected keyword argument 'type'" in str(e) or 'takes no arguments' in str(e):
+            # probably this was from a to_json method
+            if data['type'] != cls.__name__:
+                t = data['type']
+                logging.warning(f'Reconstructing a {cls.__name__} from a dict with type={t}')
+            data = data.copy()
+            del data['type']
+            for k,v  in data.items():
+                if probably_serialized_from_ebc(v):
+                    data[k] = EBC.SUBCLASSES_BY_NAME[v['type']].from_json(v)
+                elif isinstance(v, list):
+                    data[k] = [EBC.SUBCLASSES_BY_NAME[x['type']].from_json(x)
+                         if probably_serialized_from_ebc(x)
+                         else x
+                         for x in v]
+            return allow_additional_unused_keyword_arguments(cls)(**data)
+        else:
+            raise
+
+def probably_serialized_from_ebc(data):
+    return isinstance(data, dict) and 'type' in data and data['type'] in EBC.SUBCLASSES_BY_NAME
+
+
+class EBE(Enum):
+    def __int__(self):
+        return self.value
+
+    def __str__(self):
+        return self.name
+
+    def __repr__(self):
+        return self.name
+
+    @classmethod
+    def from_name(cls, variable_name):
+        return cls.__dict__[variable_name]
+
+
+class Bunch(dict, EBC):
+    def __init__(self, **kwargs):
+        dict.__init__(self, kwargs)
+        self.__dict__.update(kwargs)
+
+    def add_method(self, m):
+        setattr(self, m.__name__, functools.partial(m, self))
+
+
+def floor_to_multiple_of(x, multiple_of):
+    return math.floor(x / multiple_of) * multiple_of
+
+
+def round_to_multiple_of(x, multiple_of):
+    return round(x / multiple_of) * multiple_of
+
+
+def ceil_to_multiple_of(x, multiple_of):
+    return math.ceil(x / multiple_of) * multiple_of
+
+