Source code for src.dackar.validate

import jsonschema
import jsonpointer
import logging
import copy


[docs]
# Module-level logger; every record emitted by this module is published under
# the logger name 'DACKAR.validate'.
logger = logging.getLogger('DACKAR.validate')



[docs]
# JSON Schema (draft-07) describing the ``nlp`` table of the TOML input.
# Every nested object sets "additionalProperties": False, so keys that are not
# listed here are rejected instead of being silently ignored.
nlp_schema = {
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "properties": {
    "nlp": {
      "type": "object",
      "description": "NLP configuration settings",
      "properties": {
        "language_model": {
          "type": "string",
          "description": "Language model to be used"
        },
        "logger": {
          "type": "string",
          "description": "Logger path"
        },
        # Entity settings: both the label and the id must be given.
        "ent": {
          "type": "object",
          "description": "Entity recognition settings",
          "properties": {
            "label": {
              "type": "string",
              "description": "Entity label"
            },
            "id": {
              "type": "string",
              "description": "Entity ID"
            }
          },
          "required": ["label", "id"],
          "additionalProperties": False
        },
        # Input files: all three paths are mandatory.
        "files": {
          "type": "object",
          "description": "File paths for input data",
          "properties": {
            "text": {
              "type": "string",
              "description": "Path to text data file"
            },
            "entity": {
              "type": "string",
              "description": "Path to entity data file"
            },
            "opm": {
              "type": "string",
              "description": "Path to OPM model file"
            }
          },
          "required": ["text", "entity", "opm"],
          "additionalProperties": False
        },
        # Optional text pre-processing switches; none of them is required.
        # Note that "punctuation" is a list of strings, not a boolean flag.
        "processing": {
          "type": "object",
          "description": "Text processing settings",
          "properties": {
            "bullet_points": {
              "type": "boolean",
              "description": "Normalize bullet points"
            },
            "hyphenated_words": {
              "type": "boolean",
              "description": "Normalize hyphenated words"
            },
            "quotation_marks": {
              "type": "boolean",
              "description": "Normalize quotation marks"
            },
            "whitespace": {
              "type": "boolean",
              "description": "Normalize whitespace"
            },
            "numerize": {
              "type": "boolean",
              "description": "Convert numbers to digits"
            },
            "brackets": {
              "type": "boolean",
              "description": "Remove brackets"
            },
            "html_tags": {
              "type": "boolean",
              "description": "Remove HTML tags"
            },
            "punctuation": {
              "type": "array",
              "description": "List of punctuation marks to remove",
              "items": {
                "type": "string"
              }
            },
            "currency_symbols": {
              "type": "boolean",
              "description": "Replace currency symbols"
            },
            "emails": {
              "type": "boolean",
              "description": "Replace email addresses"
            },
            "emojis": {
              "type": "boolean",
              "description": "Replace emojis"
            },
            "hashtags": {
              "type": "boolean",
              "description": "Replace hashtags"
            },
            "numbers": {
              "type": "boolean",
              "description": "Replace numbers"
            },
            "phone_numbers": {
              "type": "boolean",
              "description": "Replace phone numbers"
            },
            "urls": {
              "type": "boolean",
              "description": "Replace URLs"
            },
            "user_handles": {
              "type": "boolean",
              "description": "Replace user handles"
            }
          },
          "additionalProperties": False
        },
        # Switches for the individual NER pipelines. This table becomes
        # mandatory when analysis.type == "ner" (see "allOf" below).
        "ner": {
          "type": "object",
          "description": "NER (Named Entity Recognition) pipeline settings",
          "properties": {
            "unit": {
              "type": "boolean",
              "description": "Enable unit NER pipeline"
            },
            "temporal_relation": {
              "type": "boolean",
              "description": "Enable temporal relation NER pipeline"
            },
            "temporal": {
              "type": "boolean",
              "description": "Enable temporal NER pipeline"
            },
            "temporal_attribute": {
              "type": "boolean",
              "description": "Enable temporal attribute NER pipeline"
            },
            "location": {
              "type": "boolean",
              "description": "Enable location NER pipeline"
            },
            "emergent_activity": {
              "type": "boolean",
              "description": "Enable emergent activity NER pipeline"
            },
            "conjecture": {
              "type": "boolean",
              "description": "Enable conjecture NER pipeline"
            }
          },
          "additionalProperties": False
        },
        # This table becomes mandatory when analysis.type == "causal"
        # (see "allOf" below).
        "causal": {
          "type": "object",
          "description": "Causal analysis settings",
          "properties": {
            "type": {
              "type": "string",
              "description": "Type of causal analysis",
              "enum": ["general", "phrase", "simple"]
            }
          },
          "additionalProperties": False
        },
        "outputs": {
          "type": "object",
          "description": "Output settings",
          "properties": {
            "csv": {
              "type": "boolean",
              "description": "Output results to CSV"
            },
            "visualize": {
              "type": "boolean",
              "description": "Enable visualization of results"
            }
          },
          "additionalProperties": False
        },
        "analysis": {
          "type": "object",
          "description": "Analysis type settings",
          "properties": {
            "type": {
              "type": "string",
              "description": "Type of analysis to perform",
              "enum": ["ner", "causal"]
            }
          },
          "required": ["type"],
          "additionalProperties": False
        }
      },
      "required": ["language_model", "ent", "files", "analysis"],
      # Conditional requirements: the table matching analysis.type must exist.
      # The "required" keywords inside each "if" stop the condition from
      # matching vacuously when "analysis" (or its "type") is missing; without
      # them both "then" clauses would fire and report misleading
      # "'ner'/'causal' is a required property" errors on top of the real one.
      "allOf": [
        {
          "if": {
            "properties": {
              "analysis": {
                "properties": {
                  "type": {
                    "const": "ner"
                  }
                },
                "required": ["type"]
              }
            },
            "required": ["analysis"]
          },
          "then": {
            "required": ["ner"]
          }
        },
        {
          "if": {
            "properties": {
              "analysis": {
                "properties": {
                  "type": {
                    "const": "causal"
                  }
                },
                "required": ["type"]
              }
            },
            "required": ["analysis"]
          },
          "then": {
            "required": ["causal"]
          }
        }
      ],
      "additionalProperties": False
    }
  },
  # The top level is intentionally left open: the "nlp" table is not required
  # and unknown top-level tables are accepted.
  # NOTE(review): presumably so a config may contain only other sections
  # (e.g. neo4j) -- confirm before enabling the two lines below.
  # "required": ["nlp"],
  # "additionalProperties": False
}



[docs]
def _stringProp(description):
  """Build the sub-schema of a plain string property.

  Args:
      description (str): human readable description stored in the schema

  Returns:
      dict: ``{"type": "string", "description": description}``
  """
  return {"type": "string", "description": description}


def _stringMapProp(description, valueDescription):
  """Build the sub-schema of a table whose values must all be strings.

  Args:
      description (str): description of the table itself
      valueDescription (str): description attached to each value

  Returns:
      dict: object schema with string-typed ``additionalProperties``
  """
  return {
    "type": "object",
    "description": description,
    "additionalProperties": _stringProp(valueDescription)
  }


# Schema of one entry of the ``neo4j.node`` array.
_neo4j_node_item = {
  "type": "object",
  "description": "Configuration for a single node.",
  "properties": {
    "file": _stringProp("Path to the CSV file containing node data."),
    "label": _stringProp("Label for the node."),
    "attribute": _stringMapProp("Mapping of node attributes.", "Attribute mapping.")
  },
  "required": ["file", "label", "attribute"]
}

# Schema of one entry of the ``neo4j.edge`` array; ``label_attribute`` is the
# only optional key and additionally carries a ``default`` of None.
_neo4j_edge_item = {
  "type": "object",
  "description": "Configuration for a single edge.",
  "properties": {
    "file": _stringProp("Path to the CSV file containing edge data."),
    "source_label": _stringProp("Label for the source node of the edge."),
    "target_label": _stringProp("Label for the target node of the edge."),
    "label": _stringProp("Label for the edge."),
    "label_attribute": {
      **_stringMapProp("Mapping of edge label attributes.", "Label attribute mapping."),
      "default": None
    },
    "source_attribute": _stringMapProp("Mapping of source node attributes.", "Source attribute mapping."),
    "target_attribute": _stringMapProp("Mapping of target node attributes.", "Target attribute mapping.")
  },
  "required": ["file", "source_label", "target_label", "label", "source_attribute", "target_attribute"]
}

# JSON Schema describing the ``neo4j`` table of the TOML input.
neo4j_schema = {
  "type": "object",
  "description": "Schema for validating the Neo4j configuration TOML input.",
  "properties": {
    "neo4j": {
      "type": "object",
      "description": "Neo4j configuration settings.",
      "properties": {
        "uri": {
          "type": "string",
          "format": "uri",
          "description": "URI for connecting to the Neo4j database."
        },
        "pwd": _stringProp("Password for connecting to the Neo4j database."),
        # Currently disabled string properties:
        #   "config_file_path"   - Path to the Neo4j configuration file.
        #   "import_folder_path" - Path to the folder where import data files are located.
        "reset": {
          "type": "boolean",
          "description": "Flag to indicate whether the database should be reset."
        },
        "node": {
          "type": "array",
          "description": "List of node configurations.",
          "items": _neo4j_node_item
        },
        "edge": {
          "type": "array",
          "description": "List of edge configurations.",
          "items": _neo4j_edge_item
        }
      },
      "required": ["uri", "pwd", "node", "edge"]
    }
  },
  # The "neo4j" table itself is not required at the top level:
  # "required": ["neo4j"]
}



[docs]
# Combined schema used to validate a complete TOML input: the NLP schema
# extended with the top-level properties of the Neo4j schema.
# Deep copies are taken on purpose. A shallow ``dict.update`` would make
# ``schema['properties']`` the very same dict object as
# ``nlp_schema['properties']``, so adding the neo4j properties would also
# modify ``nlp_schema`` itself.
schema = copy.deepcopy(nlp_schema)
schema['properties'].update(copy.deepcopy(neo4j_schema['properties']))


[docs]
def validateToml(config):
  """Validate TOML input file

  The loaded configuration is checked against the module-level ``schema``.
  On failure the validation message, the JSON pointer to the offending
  location and, when it can be resolved, the offending data are logged.

  Args:
      config (dict): loaded toml input

  Returns:
      bool: True if valid, False otherwise
  """
  try:
    jsonschema.validate(instance=config, schema=schema)
  except jsonschema.exceptions.ValidationError as e:
    # Failures are reported at ERROR level: at INFO they are discarded by
    # Python's default logging configuration and the caller would only see
    # a bare ``False`` without any explanation.
    logger.error("TOML input file is invalid.")
    logger.error(e.message)

    # Use jsonpointer to get the path to the error
    pointer = jsonpointer.JsonPointer.from_parts(e.absolute_path)
    logger.error("Path to error: %s", pointer)

    # Best effort: also show the part of the data causing the issue
    try:
      problematicData = pointer.resolve(config)
    except jsonpointer.JsonPointerException:
      logger.error("Could not resolve the path to the problematic data.")
    else:
      logger.error("Problematic data: %s", problematicData)

    return False

  logger.info("TOML input file is valid.")
  return True