read_table.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. import pandas as pd
  2. def add_zeros_zipcodes(list):
  3. for each_zipcode in range(0, len(list)):
  4. if len(list[each_zipcode]) < 5:
  5. counter = 5-len(list[each_zipcode])
  6. for each_missing_digit in range(0,counter):
  7. list[each_zipcode]='0'+list[each_zipcode]
  8. elif len(list[each_zipcode]) > 5:
  9. try:
  10. list[each_zipcode] = list[each_zipcode][0:list[each_zipcode].index('.0')]
  11. if len(list[each_zipcode]) < 5:
  12. counter = 5 - len(list[each_zipcode])
  13. for each_missing_digit in range(0, counter):
  14. list[each_zipcode] = '0' + list[each_zipcode]
  15. except:
  16. assert False, "zipcode {} contains more than 5 digits".format(list[each_zipcode])
  17. return list
  18. class Read_Table(object):
  19. def __init__(self,path, axes=[], encoding = 'utf-8', seperation = ';', type='xlsx', name=''):
  20. '''
  21. :param type: Currently Available csv, xlsx and ods
  22. '''
  23. self.axes = axes
  24. self.path = path
  25. self.encoding = encoding
  26. self.seperation = seperation
  27. self.type = type
  28. if type == 'xlsx':
  29. self.engine = 'openpyxl'
  30. self.name = name
  31. if type == 'ods':
  32. self.engine = 'odf'
  33. # low Level
  34. def read_table_excel_or_ods(self):
  35. return pd.read_excel(self.path, engine= self.engine)
  36. def read_table_SQL(self):
  37. raise NotImplementedError
  38. def read_table_csv_(self):
  39. return pd.read_csv(self.path, sep = self.seperation, encoding = self.encoding)
  40. # mid Level
  41. def get_column_titles(self):
  42. if self.type== 'xlsx':
  43. return self.read_table_excel_or_ods().columns.values
  44. if self.type== 'ods':
  45. raise NotImplementedError
  46. if self.type == 'csv':
  47. raise NotImplementedError
  48. def get_values_from_columns(self, column_titles: list):
  49. values=[]
  50. for each_element in column_titles:
  51. if self.type== 'xlsx':
  52. table= pd.read_excel(self.path, engine=self.engine)
  53. values+=[[val for val in table[each_element].dropna()]]
  54. if self.type== 'ods':
  55. raise NotImplementedError
  56. if self.type == 'csv':
  57. raise NotImplementedError
  58. return values
  59. # high Level
  60. def table_to_dict(self):
  61. if self.type == 'xlsx':
  62. column_titles = self.get_column_titles()
  63. values = self.get_values_from_columns(column_titles)
  64. return dict(zip(column_titles,values))
  65. if self.type == 'ods':
  66. raise NotImplementedError
  67. if self.type == 'csv':
  68. raise NotImplementedError
  69. class Read_unordered_Table(Read_Table):
  70. '''
  71. This Class is for Tables which do not have a column to value Struct
  72. '''
  73. def __init__(self,path, axes=[], encoding = 'utf-8', seperation = ';', type='xlsx', name=''):
  74. Read_Table.__init__(self,path, axes=axes, encoding = encoding, seperation = seperation, type=type, name=name)
  75. #High Level
  76. def get_values_after_key_as_dict(self, list_of_keys):
  77. dict_ = self.table_to_dict()
  78. ordered_dict= {}
  79. for each_key in list_of_keys:
  80. for each_list in dict_.values():
  81. for c,each_value in enumerate(each_list):
  82. if each_value == each_key:
  83. ordered_dict[each_value] = each_list[c+1:]
  84. return ordered_dict