import glob
import warnings

import nbformat as nbf
import requests
from elasticsearch import Elasticsearch, RequestsHttpConnection

warnings.filterwarnings("ignore")
The problem? I have many Jupyter Notebooks, and I need an easy way to search through them all. I am always remembering that I have some snippet of code somewhere in these notebooks, so I need an easy way to find it.
Enter Elasticsearch
Elasticsearch "is a search engine based on the Lucene library. It provides a distributed, multitenant-capable full-text search engine with an HTTP web interface and schema-free JSON documents" That means that you can put lots of documents full of text into it, and it will index all of this, and make it easy to search
The Elasticsearch folks have also built Kibana, which is "proprietary data visualization dashboard software for Elasticsearch". That means Kibana is a handy GUI tool you can use to quickly search your data, similar to using something like a Google search box.
In this notebook, I will demonstrate how to get Elasticsearch and Kibana up and running in Docker, extract the text from a folder of Jupyter Notebooks, push that text into Elasticsearch, and then search it from Python or from Kibana.
Prerequisites:
You know how to create a Jupyter Notebook and save it somewhere, and you have Docker installed.
Caveat:
This assumes your use case is that you want to easily search through your own notebooks as part of your workflow, and as such I am going to ignore some Elasticsearch security options, which is why I have an ignore-warnings filter in this notebook. Don't use this approach if you are planning something that is not dev.
This will all take about 5 minutes.
Set up and Hello World
Let's start with some setup. Open a Terminal on your machine (bash on Linux, PowerShell on Windows, whatever). Then run the following commands.
docker network create elastic
docker pull docker.elastic.co/elasticsearch/elasticsearch:7.13.1
docker run --name es01-test --net elastic -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.13.1
So that's it for installing and getting Elasticsearch up and running. Now let's do the same for Kibana. Open a new shell and run the following commands:
docker pull docker.elastic.co/kibana/kibana:7.13.1
docker run --name kib01-test --net elastic -p 5601:5601 -e "ELASTICSEARCH_HOSTS=http://es01-test:9200" docker.elastic.co/kibana/kibana:7.13.1
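If you want to double check that Kibana itself is responding before opening a browser, it exposes a status endpoint you can hit from the notebook. A minimal sketch, using the same host.docker.internal address discussed below (swap in localhost if your Jupyter runs directly on your machine):

import requests

# Kibana's status endpoint; returns 200 once it has finished starting up
res = requests.get('http://host.docker.internal:5601/api/status')
print(res.status_code)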
So it looks like everything is working. Let's make sure our notebook can see Elasticsearch too:
res = requests.get('http://host.docker.internal:9200')
print(res.content)
b'{\n "name" : "829766e7847b",\n "cluster_name" : "docker-cluster",\n "cluster_uuid" : "mLWu9gbQQqqOy5xB3IONVg",\n "version" : {\n "number" : "7.13.1",\n "build_flavor" : "default",\n "build_type" : "docker",\n "build_hash" : "9a7758028e4ea59bcab41c12004603c5a7dd84a9",\n "build_date" : "2021-05-28T17:40:59.346932922Z",\n "build_snapshot" : false,\n "lucene_version" : "8.8.2",\n "minimum_wire_compatibility_version" : "6.8.0",\n "minimum_index_compatibility_version" : "6.0.0-beta1"\n },\n "tagline" : "You Know, for Search"\n}\n'
So this notebook can connect to the Elasticsearch instance. Note that I am using host.docker.internal in the URL in that get request. This is because I have set my Jupyter up in Docker as well (details at: https://jupyter-docker-stacks.readthedocs.io/en/latest/using/selecting.html). If you have installed an Anaconda instance or something, this URL would be localhost rather than host.docker.internal.
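If you are not sure which host applies to your setup, you can just probe both. This is a small convenience sketch (findElasticsearchHost is my own hypothetical helper, not part of any library):

def findElasticsearchHost(candidates=("host.docker.internal", "localhost")):
    # Hypothetical helper: return the first candidate host answering on port 9200
    for host in candidates:
        try:
            if requests.get(f"http://{host}:9200", timeout=2).ok:
                return host
        except requests.exceptions.ConnectionError:
            pass
    raise RuntimeError("Could not reach Elasticsearch on any candidate host")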
Now, using our Python elasticsearch library, let's create a connection to Elasticsearch:
# Note "host.docker.internal" might be "localhost" if you are running an Anaconda version of Jupyter
es = Elasticsearch(hosts=[{"host": "host.docker.internal", "port": 9200}],
                   connection_class=RequestsHttpConnection, max_retries=30,
                   retry_on_timeout=True, timeout=30)
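Before going further, it is worth a quick sanity check that the client can actually reach the cluster. es.ping() returns a boolean instead of raising, so it is handy for this:

# True if the cluster answered, False otherwise (no exception raised)
print(es.ping())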
Let's create an index (think of this as a db) and put some data into it:
# index some test data
es.index(index='testing-index', doc_type='test', id=1, body={'test': 'test'})
{'_index': 'testing-index', '_type': 'test', '_id': '1', '_version': 3, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 8, '_primary_term': 1}
We can go to http://localhost:9200/testing-index/_search?pretty=true&q=*:*
and see that the data now exists in Elasticsearch. Or we could just retrieve it using the Python Elasticsearch library:
res = es.get(index="testing-index", id=1)
res
{'_index': 'testing-index', '_type': '_doc', '_id': '1', '_version': 3, '_seq_no': 8, '_primary_term': 1, 'found': True, '_source': {'test': 'test'}}
So that works. Let's delete it now:
es.delete(index='testing-index', doc_type='test', id=1)
{'_index': 'testing-index', '_type': 'test', '_id': '1', '_version': 4, 'result': 'deleted', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 9, '_primary_term': 1}
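That removed the single document. If you would rather drop the whole testing index to keep the cluster tidy, the indices API can do that; passing ignore=[400, 404] makes it a no-op if the index is already gone:

# Drop the entire testing index; ignore the error if it does not exist
es.indices.delete(index='testing-index', ignore=[400, 404])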
Extracting Text from Jupyter Notebooks
Now let's do something a little more substantial. I have a folder full of Jupyter Notebooks, and I always need code from one or another of them. So let's create a function to extract all the text from the notebooks. First I need a list of the names of the Jupyter Notebooks from the directory in which they are located:
pathToLocationJupyterNotebookFiles = "../work/HTMNotebooks/"
jupyterNotebooksFileNames = glob.glob(pathToLocationJupyterNotebookFiles + './*.ipynb')
jupyterNotebooksFileNames
['../work/HTMNotebooks/./HTMTest.ipynb', '../work/HTMNotebooks/./HTM_Overview_0.ipynb', '../work/HTMNotebooks/./HTM_Overview_1.ipynb', '../work/HTMNotebooks/./HTM_Overview_10.ipynb', '../work/HTMNotebooks/./HTM_Overview_11.ipynb', '../work/HTMNotebooks/./HTM_Overview_2.ipynb', '../work/HTMNotebooks/./HTM_Overview_3.ipynb', '../work/HTMNotebooks/./HTM_Overview_4.ipynb', '../work/HTMNotebooks/./HTM_Overview_5.ipynb', '../work/HTMNotebooks/./HTM_Overview_6.ipynb', '../work/HTMNotebooks/./HTM_Overview_7.ipynb', '../work/HTMNotebooks/./HTM_Overview_8.ipynb', '../work/HTMNotebooks/./HTM_Overview_9.ipynb']
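My notebooks all sit in one flat folder. If yours are nested in subdirectories, glob can recurse with the ** pattern instead; a sketch:

# recursive=True lets ** match notebooks in nested subdirectories too
jupyterNotebooksFileNames = glob.glob(pathToLocationJupyterNotebookFiles + '**/*.ipynb', recursive=True)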
Now let's create a function to extract the text from a notebook, and then we will just iterate over each of the notebooks to extract the code:
NB_VERSION = 4

def extractTextFromNotebook(notebook_path):
    # Read a notebook and collect the source text of every code and markdown cell
    formatted = nbf.read(notebook_path, as_version=NB_VERSION)
    text = []
    for cell in formatted.get('cells', []):
        if 'source' in cell and cell.get('cell_type') in ('code', 'markdown'):
            text.append(cell['source'])
    return text
textFromNotebooks = [extractTextFromNotebook(fileName) for fileName in jupyterNotebooksFileNames]
We get back a list of lists, one per notebook. Let's check we are on the right track by looking at, say, item 5 of the second notebook:
textFromNotebooks[1][5]
'from htm.bindings.sdr import SDR, Metrics\nfrom htm.encoders.rdse import RDSE, RDSE_Parameters\nfrom htm.encoders.date import DateEncoder\nfrom htm.bindings.algorithms import SpatialPooler\nfrom htm.bindings.algorithms import TemporalMemory\nfrom htm.algorithms.anomaly_likelihood import AnomalyLikelihood \nfrom htm.bindings.algorithms import Predictor'
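If you also wanted each search hit to tell you whether it came from a code or a markdown cell, a small variant of the function could keep that metadata. A sketch of the idea, not what I used above:

def extractCellsWithTypeFromNotebook(notebook_path):
    # Hypothetical variant: keep the cell type and position alongside the text
    formatted = nbf.read(notebook_path, as_version=NB_VERSION)
    cells = []
    for i, cell in enumerate(formatted.get('cells', [])):
        if 'source' in cell and cell.get('cell_type') in ('code', 'markdown'):
            cells.append({'cellIndex': i,
                          'cellType': cell['cell_type'],
                          'text': cell['source']})
    return cells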
Now let's iterate through all those notebooks converted to text and push them into Elasticsearch. Elasticsearch wants something JSON-like, so that is what we will give it. Note that the indexing function below returns nothing, so the list comprehension that calls it just prints a list of None values, but the indexing still works:
elasticDBName = "j-notebook-cell-search-index"

def writeTextCellsToElasticSearchDB(doc, notebookFilePath):
    # Index each cell's text as its own document, tagged with its source notebook
    for cellText in doc:
        cellDict = {'text': [cellText],  # stored as a one-element list in _source
                    'noteBookFilePath': notebookFilePath}
        es.index(index=elasticDBName, doc_type='cell', body=cellDict)
[writeTextCellsToElasticSearchDB(text, path) for text, path in zip(textFromNotebooks, jupyterNotebooksFileNames)]
[None, None, None, None, None, None, None, None, None, None, None, None, None]
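Indexing cell by cell like this is fine at my scale. For a lot of notebooks, the Python client also ships a bulk helper that sends documents in batches; a sketch of the same indexing done that way (equivalent result, but not the code I ran above, and buildBulkActions is my own hypothetical helper):

from elasticsearch.helpers import bulk

def buildBulkActions(notebookTexts, notebookFilePaths):
    # Hypothetical helper: yield one action dict per cell for the bulk API
    for doc, path in zip(notebookTexts, notebookFilePaths):
        for cellText in doc:
            yield {'_index': elasticDBName,
                   '_source': {'text': [cellText], 'noteBookFilePath': path}}

bulk(es, buildBulkActions(textFromNotebooks, jupyterNotebooksFileNames))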
Searching Notebooks
So now all the data is in Elasticsearch, and we want to search it. There are three ways to do this. The quickest is simply to open the search endpoint in your browser:
http://localhost:9200/j-notebook-cell-search-index/_search?pretty=true&q=*:*
The other two options are covered below.
Option 1: Using Python
This is my preferred way of doing it. Here are some handy getting-started searches you can run against the data you have put into Elasticsearch:
# Grab a particular record now that the index is up and running - note I got this ID
# from http://localhost:9200/j-notebook-cell-search-index/_search?pretty=true&q=*:*
es.get(index=elasticDBName, doc_type="_doc", id="VFtpKHoB3T1ThL6Sx1Yg")
{'_index': 'j-notebook-cell-search-index', '_type': '_doc', '_id': 'VFtpKHoB3T1ThL6Sx1Yg', '_version': 1, '_seq_no': 0, '_primary_term': 1, 'found': True, '_source': {'text': ['import csv\nimport datetime\nimport os\nimport numpy as np\nimport random\nimport math\n\nfrom htm.bindings.sdr import SDR, Metrics\nfrom htm.encoders.rdse import RDSE, RDSE_Parameters\nfrom htm.encoders.date import DateEncoder\nfrom htm.bindings.algorithms import SpatialPooler\nfrom htm.bindings.algorithms import TemporalMemory\nfrom htm.algorithms.anomaly_likelihood import AnomalyLikelihood #FIXME use TM.anomaly instead, but it gives worse results than the py.AnomalyLikelihood now\nfrom htm.bindings.algorithms import Predictor'], 'noteBookFilePath': '../work/HTMNotebooks/./HTMTest.ipynb'}}
It supports all kinds of queries: text match, partial match, etc. Here is another example. The use case here is that I know I have a notebook where I did some work on Baltimore Crime Data, but I can't remember where. So I will put in the prefix "crim" and let Elasticsearch do its thing.
Note that things can get a bit messy, so I would advise you to keep your query in a separate Python dictionary, and then just pass that into the search:
q = {
    "query": {
        "prefix": {
            "text": "crim"
        }
    }
}

es.search(index=elasticDBName, body=q)
{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 3, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'j-notebook-cell-search-index', '_type': 'cell', '_id': 'OltpKHoB3T1ThL6Szlec', '_score': 1.0, '_source': {'text': ['To explore this, let\'s use some data. There is some really interesting data that will turn up in episode\'s 7 and 8 in the context of the Spatial Pooler, that has some interesting info, but for now, let\'s use Baltimore Crime Data. This data has nice coverage across a number of data points, descriptive names, some categorical variables, the footprint isn\'t too big but it gives us a nice sample of 96k records\n\nInformation available <a href="https://data.baltimorecity.gov/datasets/baltimore::part1-crime-2015-to-2016/about">https://data.baltimorecity.gov/datasets/baltimore::part1-crime-2015-to-2016/about</a>\n'], 'noteBookFilePath': '../work/HTMNotebooks/./HTM_Overview_5.ipynb'}}, {'_index': 'j-notebook-cell-search-index', '_type': 'cell', '_id': 'PVtpKHoB3T1ThL6Szlez', '_score': 1.0, '_source': {'text': ['df = pd.read_csv("./data/Part1_Crime_2015_to__2016.csv")\ndf.CrimeDateTime = df.CrimeDateTime.str.slice(0, -8)\ndf.CrimeDateTime= pd.to_datetime(df.CrimeDateTime)\ndf[\'weekdayCodeWhenEventReported\'] = [d.weekday() for d in df.CrimeDateTime]\ndf[\'monthCodeWhenEventReported\'] = df[\'CrimeDateTime\'].dt.month\ndf[\'seasonCodeWhenEventReported\'] = (df[\'CrimeDateTime\'].dt.month - 1) % 4\ndf[\'isWeekend\'] = np.where(df.weekdayCodeWhenEventReported > 4, True, False)\ndf = df.drop(\'VRIName\', axis=1)\ndf = df.drop(\'HashedRecord\', axis=1)\ndf = df.drop(\'ObjectId\', axis=1)'], 'noteBookFilePath': '../work/HTMNotebooks/./HTM_Overview_5.ipynb'}}, {'_index': 'j-notebook-cell-search-index', '_type': 'cell', '_id': '0ltpKHoB3T1ThL6S01cp', '_score': 1.0, '_source': {'text': ["<h2>HTM Overview 9: Boosting</h2>\n\nSo now let's start working with. The first thing we want to do is create a Scalar Encoder\n\nCrime data"], 'noteBookFilePath': '../work/HTMNotebooks/./HTM_Overview_9.ipynb'}}]}}
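Prefix queries are only one option. A full-text match query with highlighting is often more useful, since Elasticsearch scores results by relevance and marks up the matching fragments; a sketch against the same index:

q = {
    "query": {
        "match": {"text": "baltimore crime"}
    },
    "highlight": {
        "fields": {"text": {}}  # matching fragments come back wrapped in <em> tags
    }
}
res = es.search(index=elasticDBName, body=q)
for hit in res['hits']['hits']:
    print(hit['_score'], hit['_source']['noteBookFilePath'])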
Option 2: Using Kibana
Kibana is cool, but after a while I did get annoyed at the UI. But if you are going to use it:
Go to http://localhost:5601/app/home#/ which should be up and running.
Go to Stack Management at http://localhost:5601/app/management and then to Index Patterns at http://localhost:5601/app/management/kibana/indexPatterns
Create an index pattern matching our index at http://localhost:5601/app/management/kibana/indexPatterns/create
Go back to http://localhost:5601/app/home#/
and choose "Discover" from the left hand index. From there you will have a search box and some filters, and all kinds of cool things you can check.
Enjoy!