Convert Google Keep backup to Orgmode files
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

233 lines
8.1 KiB

  1. import os
  2. import html
  3. import sys
  4. import datetime
  5. """
  6. KeepToOrg.py
  7. Usage:
  8. python KeepToOrg.py /path/to/google/Keep output/dir
  9. Given a Takeout of your Google Keep Notes in .html format, output .org files with logical groupings
  10. based on tags. This will also format lists and try to be smart.
  11. """
  12. # TODO:
  13. # Format links:
  14. # Links have the syntax [[https://blah][Example link]] (things can be internal links too!)
  15. # See https://orgmode.org/manual/External-links.html
  16. # Convert an array of tags to an Emacs Org tag string
  17. # Tags have the syntax :tag: or :tag1:tag2:
  18. def tagsToOrgString(tags):
  19. if len(tags) == 0:
  20. return ''
  21. tagString = ':'
  22. for tag in tags:
  23. tagString += tag + ':'
  24. return tagString
  25. class Note:
  26. def __init__(self):
  27. self.title = ''
  28. self.body = ''
  29. self.tags = []
  30. self.archived = False
  31. # If no date can be parsed, set it to Jan 1, 2000
  32. self.date = datetime.datetime(2000, 1, 1)
  33. def toOrgString(self):
  34. # status = '(archived) ' if self.archived else ''
  35. # Create a copy so we can mangle it
  36. body = self.body
  37. title = self.title
  38. # Convert lists to org lists. This is a total hack but works
  39. body = body.replace('<li class="listitem"><span class="bullet">&#9744;</span>\n', '- [ ] ')
  40. body = body.replace('<li class="listitem checked"><span class="bullet">&#9745;</span>', '- [X] ')
  41. # Flat out remove these
  42. for htmlTagToErase in ['<span class="text">', '</span>', '</li>', '<ul class="list">', '</ul>']:
  43. body = body.replace(htmlTagToErase, '')
  44. # This is very weird, but fix the edge case where the list entry has a new line before the content
  45. for listTypeToFixNewLines in ['- [ ] \n','- [X] \n']:
  46. body = body.replace(listTypeToFixNewLines, listTypeToFixNewLines[:-1])
  47. # Unescape all (e.g. remove &quot and replace with ")
  48. title = html.unescape(title)
  49. body = html.unescape(body)
  50. for i, tag in enumerate(self.tags):
  51. self.tags[i] = html.unescape(tag)
  52. # Strip tags
  53. for tag in self.tags:
  54. body = body.replace('#{}'.format(tag), '')
  55. # Remove any leading/trailing whitespace (possibly leftover from tags stripping)
  56. body = body.strip()
  57. # Make a title if necessary
  58. orgTitle = title
  59. if not orgTitle:
  60. toNewline = body.find('\n')
  61. # If there's a line break; use the first line as a title
  62. if toNewline >= 0:
  63. orgTitle = body[:toNewline]
  64. body = body[len(orgTitle) + 1:]
  65. # The note has no breaks; make the body the title
  66. else:
  67. orgTitle = body
  68. # If the title is the whole body, clear the body
  69. body = ''
  70. nesting = '*' if self.archived else ''
  71. # Various levels of information require different formats
  72. if body or len(self.tags):
  73. if body and not len(self.tags):
  74. return '*{} {}\n{}'.format(nesting, orgTitle, body)
  75. if not body and len(self.tags):
  76. return '*{} {} {}\n'.format(nesting, orgTitle, tagsToOrgString(self.tags))
  77. else:
  78. return "*{} {} {}\n{}\n".format(nesting, orgTitle, body, tagsToOrgString(self.tags))
  79. # If no body nor tags, note should be a single line
  80. else:
  81. return '*{} {}'.format(nesting, orgTitle)
  82. def getAllNoteHtmlFiles(htmlDir):
  83. print('Looking for notes in {}'.format(htmlDir))
  84. noteHtmlFiles = []
  85. for root, dirs, files in os.walk(htmlDir):
  86. for file in files:
  87. if file.endswith('.html'):
  88. noteHtmlFiles.append(os.path.join(root, file))
  89. print ('Found {} notes'.format(len(noteHtmlFiles)))
  90. return noteHtmlFiles
  91. def getHtmlValueIfMatches(line, tag, endTag):
  92. if tag.lower() in line.lower() and endTag.lower() in line.lower():
  93. return line[line.find(tag) + len(tag):-(len(endTag) + 1)], True
  94. return '', False
  95. def makeSafeFilename(strToPurify):
  96. strToPurify = strToPurify.replace('/', '')
  97. strToPurify = strToPurify.replace('.', '')
  98. return strToPurify
  99. def main(keepHtmlDir, outputDir):
  100. noteFiles = getAllNoteHtmlFiles(keepHtmlDir)
  101. noteGroups = {}
  102. for noteFilePath in noteFiles:
  103. # Read in the file
  104. noteFile = open(noteFilePath)
  105. noteLines = noteFile.readlines()
  106. noteFile.close()
  107. # print('Parsing {}'.format(noteFilePath))
  108. note = Note()
  109. readState = 'lookingForAny'
  110. numOpenedDivs = 0
  111. for line in noteLines:
  112. isMatch = False
  113. numOpenedDivs += line.count('<div')
  114. numOpenedDivs -= line.count('</div>')
  115. if readState == 'lookingForAny':
  116. if '<span class="archived" title="Note archived">' in line:
  117. note.archived = True
  118. # Parse title
  119. title, isMatch = getHtmlValueIfMatches(line, '<div class="title">', '</div>')
  120. if isMatch:
  121. note.title = title
  122. continue
  123. if '<div class="content">' in line:
  124. readState = 'parsingBody'
  125. # This isn't great; for same-line bodies, strip opening div
  126. line = line.replace('<div class="content">', '')
  127. # Parse the date
  128. if ' AM</div>' in line or ' PM</div>' in line:
  129. dateString = line.replace('</div>', '').strip()
  130. # Example: "Apr 27, 2018, 6:32:15 PM"
  131. note.date = datetime.datetime.strptime(dateString, '%b %d, %Y, %I:%M:%S %p')
  132. # Parse tags, if any
  133. potentialTag, isMatch = getHtmlValueIfMatches(line, '<span class="label-name">', '</span>')
  134. if isMatch:
  135. note.tags.append(potentialTag)
  136. continue
  137. # Parse body
  138. if readState == 'parsingBody':
  139. if line.strip().lower() == '<br>':
  140. line = '\n'
  141. if line.strip().lower().endswith('</div>') and numOpenedDivs == 1:
  142. line = line[:-(len('</div>') + 1)]
  143. readState = 'lookingForAny'
  144. note.body += line.replace('<br>', '\n')
  145. # Add to groups based on tags
  146. for tag in note.tags:
  147. if tag in noteGroups:
  148. noteGroups[tag].append(note)
  149. else:
  150. noteGroups[tag] = [note]
  151. if not note.tags:
  152. if 'Untagged' in noteGroups:
  153. noteGroups['Untagged'].append(note)
  154. else:
  155. noteGroups['Untagged'] = [note]
  156. # We've parsed all the notes; write out the groups to separate .org files
  157. numNotesWritten = 0
  158. for tag, group in noteGroups.items():
  159. outFileName = '{}/{}.org'.format(outputDir, makeSafeFilename(tag))
  160. notesSortedByDate = sorted(group, key=lambda note: note.date)
  161. # If capture etc. appends, we should probably follow that same logic (don't reverse)
  162. # notesSortedByDate.reverse()
  163. # Concatenate all notes into lines
  164. lines = []
  165. archivedLines = []
  166. for note in notesSortedByDate:
  167. if note.archived:
  168. archivedLines.append(note.toOrgString() + '\n')
  169. else:
  170. lines.append(note.toOrgString() + '\n')
  171. if len(archivedLines):
  172. lines = ['* *Archived*\n'] + archivedLines + lines
  173. outFile = open(outFileName, 'w')
  174. outFile.writelines(lines)
  175. outFile.close()
  176. print('Wrote {} notes to {}'.format(len(group), outFileName))
  177. numNotesWritten += len(group)
  178. print('Wrote {} notes total'.format(numNotesWritten))
  179. if __name__ == '__main__':
  180. if len(sys.argv) != 3:
  181. print('Wrong number of arguments!\nUsage:\n\tpython KeepToOrg.py /path/to/google/Keep output/dir')
  182. else:
  183. keepHtmlDir = sys.argv[1]
  184. outputDir = sys.argv[2]
  185. main(keepHtmlDir, outputDir)