statistics.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
  1. import math
  2. import os
  3. from typing import List, Union
  4. import numpy
  5. import pandas
  6. from pandas import DataFrame
  7. from tool_lib.tuned_pandasql import TunedPandaSQL
  8. from tool_lib.util import my_tabulate, EBC, EBE
  9. class Category(EBE):
  10. UNKNOWN = 'UNKNOWN'
  11. OPERATIVE = 'OPERATIVE'
  12. MEETING = 'MEETING'
  13. PLANNING = 'PLANNING'
  14. STRATEGY = 'STRATEGY'
  15. CONTROLLING = 'CONTROLLING'
  16. AUTOMATION = 'AUTOMATION'
  17. WASTED_TIME = 'WASTED_TIME'
  18. NOT_AVAILABLE = 'NOT_AVAILABLE'
  19. @classmethod
  20. def determine_category(cls, task: Union[str, float]) -> 'Category':
  21. task = str(task)
  22. if task == 'nan':
  23. return cls.NOT_AVAILABLE
  24. elif task == 'root':
  25. return cls.NOT_AVAILABLE
  26. elif 'meeting' in task.lower():
  27. return cls.MEETING
  28. elif 'eckard' in task.lower() or 'eckhard' in task.lower():
  29. return cls.MEETING
  30. elif 'unternehmensberatung' in task.lower():
  31. return cls.MEETING
  32. elif 'wirschaftsbüro' in task.lower() or 'wirtschaftsbüro' in task.lower():
  33. return cls.MEETING
  34. elif 'controlling' in task.lower():
  35. return cls.CONTROLLING
  36. elif 'strateg' in task.lower():
  37. return cls.STRATEGY
  38. elif 'arbeitserfassungstool' in task.lower():
  39. return cls.AUTOMATION
  40. elif 'worktime table' in task.lower():
  41. return cls.AUTOMATION
  42. elif 'automat' in task.lower():
  43. return cls.AUTOMATION
  44. elif 'template' in task.lower():
  45. return cls.AUTOMATION
  46. elif 'rechnung' in task.lower():
  47. return cls.AUTOMATION
  48. elif 'operativ' in task.lower():
  49. return cls.OPERATIVE
  50. elif 'website' in task.lower():
  51. return cls.OPERATIVE
  52. elif 'bunq' in task.lower():
  53. return cls.WASTED_TIME
  54. elif task == 'root/Master-Arbeitsstunden ':
  55. return cls.OPERATIVE
  56. elif 'planungsassistent' in task.lower():
  57. return cls.OPERATIVE
  58. elif 'telefonat' in task.lower():
  59. return cls.OPERATIVE
  60. elif 'umfrage' in task.lower():
  61. return cls.OPERATIVE
  62. elif 'code cleaning' in task.lower():
  63. return cls.OPERATIVE
  64. elif 'economy_calendar' in task.lower() or 'economy calendar' in task.lower():
  65. return cls.OPERATIVE
  66. elif 'tradingbot' in task.lower() or 'trading bot' in task.lower():
  67. return cls.OPERATIVE
  68. elif 'optiwae' in task.lower():
  69. return cls.OPERATIVE
  70. elif 'twilio' in task.lower():
  71. return cls.OPERATIVE
  72. elif 'linkguide' in task.lower():
  73. return cls.OPERATIVE
  74. elif 'plan' in task.lower():
  75. return cls.PLANNING
  76. elif 'tools' in task.lower():
  77. return cls.AUTOMATION
  78. else:
  79. return cls.UNKNOWN
  80. unknown_categories = set()
  81. def category_from_row(row):
  82. result = Category.determine_category(row['Task']).value
  83. if result == Category.UNKNOWN.value:
  84. unknown_categories.add(row['Task'])
  85. return result
  86. def load_from_disk():
  87. dataframes = []
  88. this_dir = os.path.dirname(__file__)
  89. for xlsx_path in os.listdir(os.path.join(this_dir, 'time_recorded_tables')):
  90. if not xlsx_path.endswith('.xlsx'):
  91. continue
  92. if xlsx_path.startswith('~$'):
  93. continue
  94. if 'example' in xlsx_path:
  95. continue
  96. dataframes.append(pandas.read_excel(os.path.join(this_dir, 'time_recorded_tables', xlsx_path)))
  97. result = pandas.concat(dataframes)
  98. result['Datum'] = pandas.to_datetime(result['Datum'])
  99. result['Kalenderwoche'] = result['Datum'].dt.strftime('%V')
  100. result.sort_values(by=['Datum'], inplace=True)
  101. return result
  102. def get_date(row):
  103. if pandas.isnull(row['Datum']):
  104. return 'NULL'
  105. return row['Datum'].strftime('%d.%m.%Y')
  106. def get_name(row):
  107. return row['Name']
  108. def get_year(row):
  109. if pandas.isnull(row['Datum']):
  110. return 'NULL'
  111. return row['Datum'].strftime('%Y')
  112. def get_month(row):
  113. if pandas.isnull(row['Datum']):
  114. return 'NULL'
  115. return row['Datum'].strftime('%m.%Y')
  116. def get_task(row):
  117. return row['Task']
  118. def get_work_time(row):
  119. return row['Arbeitszeit']
  120. def get_work_time_minutes(row):
  121. return row['Arbeitszeit in Min']
  122. def get_income(row):
  123. return row['Einkommen']
  124. def get_overtime(row):
  125. return row['Überstunden']
  126. def raw_df():
  127. MAX_TASK_PARTS = 4
  128. def get_calendar_week(row):
  129. return row['Kalenderwoche']
  130. columns = {
  131. 'Name': get_name,
  132. 'Datum': get_date,
  133. 'Monat': get_month,
  134. 'Jahr': get_year,
  135. 'Task': get_task,
  136. 'Kalenderwoche': get_calendar_week,
  137. 'Arbeitszeit': get_work_time,
  138. 'Arbeitszeit in Minuten': get_work_time_minutes,
  139. 'Einkommen': get_income,
  140. 'Überstunden': get_overtime,
  141. 'Kategorie': category_from_row,
  142. **{
  143. f'Task Präfix {i}': lambda row, i=i: get_task_prefix(row, i)
  144. for i in range(1, MAX_TASK_PARTS)
  145. }
  146. }
  147. combinations: List[tuple] = [
  148. ('Name', 'Jahr'),
  149. # ('Name', 'Task'),
  150. # ('Name', 'Task Präfix 1'),
  151. ('Name', 'Task Präfix 2'),
  152. # ('Name', 'Task Präfix 3'),
  153. ]
  154. for combination in combinations:
  155. new_column_name = '_by_'.join(combination)
  156. columns[new_column_name] = lambda *args, combination=combination, **kwargs: ', '.join(str(columns[column](*args, **kwargs)) for column in combination)
  157. df = load_from_disk()
  158. table = [
  159. [columns[column](row) for column in columns]
  160. for row in df.to_dict('records')
  161. ]
  162. return DataFrame(table, columns=list(columns))
  163. def get_task_prefix(row, prefix_length):
  164. return "/".join(str(row['Task']).split('/')[:prefix_length])
  165. def preprocess_for_sql(x):
  166. if isinstance(x, int):
  167. return str(x)
  168. elif isinstance(x, str):
  169. x = x.replace("'", r"\'")
  170. return f"'{x}'"
  171. elif isinstance(x, float):
  172. return str(x)
  173. elif numpy.isscalar(x):
  174. return str(x)
  175. elif x is None:
  176. return 'NULL'
  177. else:
  178. raise NotImplementedError(type(x))
  179. def first_column_values(values):
  180. return list(values[values.columns[0]])
  181. def compute_rows(max_distinct_values=120, ):
  182. df = raw_df()
  183. result = []
  184. if df.shape[0] == 0:
  185. return result
  186. relevant_columns = [column for column in df.columns
  187. if not column.startswith('_')]
  188. distinct_column_values = TunedPandaSQL().multiquery(
  189. {column: f'SELECT DISTINCT "{column}" FROM df' for column in df.columns},
  190. {'df': df[relevant_columns]}
  191. )
  192. resultss = TunedPandaSQL().multiquery(
  193. {
  194. (column, v): f'SELECT Arbeitszeit, "Arbeitszeit in Minuten", Einkommen, Überstunden '
  195. f'FROM df WHERE "{column}" IS {v}'
  196. for column in relevant_columns
  197. for column_value in first_column_values(distinct_column_values[column])
  198. if len(distinct_column_values[column]) <= max_distinct_values
  199. for v in [preprocess_for_sql(column_value)]
  200. if v != 'nan'
  201. },
  202. {'df': df[relevant_columns]}
  203. )
  204. for column in relevant_columns:
  205. values: DataFrame = distinct_column_values[column]
  206. if len(values) > max_distinct_values:
  207. print(f'{len(values)} distinct values in column {column}')
  208. for (column, v), results in resultss.items():
  209. if column in ['Arbeitszeit', 'Arbeitszeit in Minuten', 'Einkommen', 'Überstunden']:
  210. continue
  211. new_row = [
  212. column,
  213. v,
  214. results['Arbeitszeit'].sum(),
  215. results['Arbeitszeit in Minuten'].sum(),
  216. results['Einkommen'].sum(),
  217. results['Überstunden'].sum(),
  218. ]
  219. result.append(new_row)
  220. return result, ['Key', 'Value', 'Arbeitszeit', 'Arbeitszeit in Minuten', 'Einkommen', 'Überstunden']
  221. rows, columns = compute_rows()
  222. rows = sorted(rows, key=lambda row: (row[0], row[1]))
  223. print(my_tabulate(data=rows, headers=columns))
  224. print()
  225. print('Unspecified categories:')
  226. for c in unknown_categories:
  227. print(' -', repr(c))