OmniSciDB  72c90bc290
run_benchmark_import.py
import os
import timeit
import logging
import uuid
import datetime
import json
import pymapd
import pandas
import re
from argparse import ArgumentParser

# For usage info, run: `./<script_name>.py --help`

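# Example invocation (illustrative; the label and file paths are placeholders,
# "-e output" prints results instead of writing to a destination db):
#   python3 run_benchmark_import.py -l nightly_test \
#       -f /data/import_test.csv -c table_schemas/import_test.sql -e output
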
def get_connection(**kwargs):
    """
      Connects to the db using pymapd
      https://pymapd.readthedocs.io/en/latest/usage.html#connecting

      Kwargs:
        db_user(str): DB username
        db_passwd(str): DB password
        db_server(str): DB host
        db_port(int): DB port
        db_name(str): DB name

      Returns:
        con(class): Connection class
        False(bool): The connection failed. Exception should be logged.
    """
    try:
        logging.debug("Connecting to mapd db...")
        con = pymapd.connect(
            user=kwargs["db_user"],
            password=kwargs["db_passwd"],
            host=kwargs["db_server"],
            port=kwargs["db_port"],
            dbname=kwargs["db_name"],
        )
        logging.info("Successfully connected to mapd db")
        return con
    except (pymapd.exceptions.OperationalError, pymapd.exceptions.Error):
        logging.exception("Error connecting to database.")
        return False
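
# Example usage of get_connection (illustrative), mirroring the default
# source-db arguments defined below:
#   con = get_connection(db_user="mapd", db_passwd="HyperInteractive",
#                        db_server="localhost", db_port=6274, db_name="mapd")
#   if not con:
#       exit(1)  # connection failed; the exception was already logged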


def json_format_handler(x):
    # Function to allow json to deal with datetime and numpy int
    if isinstance(x, datetime.datetime):
        return x.isoformat()
    raise TypeError("Unknown type")

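# Example (illustrative): json.dumps cannot serialize datetime objects by
# itself, so it is given json_format_handler via its "default" hook:
#   json.dumps({"ts": datetime.datetime(2020, 1, 1)}, default=json_format_handler)
#   -> '{"ts": "2020-01-01T00:00:00"}'
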
# Parse input parameters
parser = ArgumentParser()
optional = parser._action_groups.pop()
required = parser.add_argument_group("required arguments")
parser._action_groups.append(optional)
optional.add_argument(
    "-v", "--verbose", action="store_true", help="Turn on debug logging"
)
optional.add_argument(
    "-q",
    "--quiet",
    action="store_true",
    help="Suppress script output " + "(except warnings and errors)",
)
required.add_argument(
    "-u", "--user", dest="user", default="mapd", help="Source database user"
)
required.add_argument(
    "-p",
    "--passwd",
    dest="passwd",
    default="HyperInteractive",
    help="Source database password",
)
required.add_argument(
    "-s",
    "--server",
    dest="server",
    default="localhost",
    help="Source database server hostname",
)
optional.add_argument(
    "-o",
    "--port",
    dest="port",
    type=int,
    default=6274,
    help="Source database server port",
)
required.add_argument(
    "-n", "--name", dest="name", default="mapd", help="Source database name"
)
required.add_argument(
    "-l", "--label", dest="label", required=True, help="Benchmark run label"
)
required.add_argument(
    "-f",
    "--import-file",
    dest="import_file",
    required=True,
    help="Absolute path to file on heavydb machine with data for "
    + "import test",
)
required.add_argument(
    "-c",
    "--table-schema-file",
    dest="table_schema_file",
    required=True,
    help="Path to local file with CREATE TABLE sql statement for the "
    + "import table",
)
optional.add_argument(
    "-t",
    "--import-table-name",
    dest="import_table_name",
    default="import_benchmark_test",
    help="Name of table to import data to. NOTE: This table will be dropped "
    + "before and after the import test, unless "
    + "--no-drop-table-[before/after] is specified.",
)
optional.add_argument(
    "-F",
    "--import-query-template-file",
    dest="import_query_template_file",
    help="Path to file containing template for import query. "
    + 'The script will replace "##TAB##" with the value of import_table_name '
    + 'and "##FILE##" with the value of import_file. By default, the '
    + "script will use the COPY FROM command with the default "
    + "delimiter (,).",
)
optional.add_argument(
    "--no-drop-table-before",
    dest="no_drop_table_before",
    action="store_true",
    help="Do not drop the import table and recreate it before import. "
    + "NOTE: Make sure existing table schema matches import .csv file schema",
)
optional.add_argument(
    "--no-drop-table-after",
    dest="no_drop_table_after",
    action="store_true",
    help="Do not drop the import table after import",
)
optional.add_argument(
    "-A",
    "--import-test-name",
    dest="import_test_name",
    help='Name of import test (ex: "ips"). Required when using '
    + "jenkins_bench_json as output.",
)
optional.add_argument(
    "-m", "--machine-name", dest="machine_name", help="Name of source machine"
)
optional.add_argument(
    "-a",
    "--machine-uname",
    dest="machine_uname",
    help="Uname info from " + "source machine",
)
optional.add_argument(
    "-e",
    "--destination",
    dest="destination",
    default="mapd_db",
    help="Destination type: [mapd_db, file_json, output, jenkins_bench] "
    + "Multiple values can be input separated by commas, "
    + 'ex: "mapd_db,file_json"',
)
optional.add_argument(
    "-U",
    "--dest-user",
    dest="dest_user",
    default="mapd",
    help="Destination mapd_db database user",
)
optional.add_argument(
    "-P",
    "--dest-passwd",
    dest="dest_passwd",
    default="HyperInteractive",
    help="Destination mapd_db database password",
)
optional.add_argument(
    "-S",
    "--dest-server",
    dest="dest_server",
    help="Destination mapd_db database server hostname"
    + ' (required if destination = "mapd_db")',
)
optional.add_argument(
    "-O",
    "--dest-port",
    dest="dest_port",
    type=int,
    default=6274,
    help="Destination mapd_db database server port",
)
optional.add_argument(
    "-N",
    "--dest-name",
    dest="dest_name",
    default="mapd",
    help="Destination mapd_db database name",
)
optional.add_argument(
    "-T",
    "--dest-table",
    dest="dest_table",
    default="import_results",
    help="Destination mapd_db table name",
)
optional.add_argument(
    "-C",
    "--dest-table-schema-file",
    dest="dest_table_schema_file",
    default="results_table_schemas/import-results.sql",
    help="Destination table schema file. This must be an executable CREATE "
    + "TABLE statement that matches the output of this script. It is "
    + "required when creating the results table. Default location is in "
    + '"./results_table_schemas/import-results.sql"',
)
optional.add_argument(
    "-j",
    "--output-file-json",
    dest="output_file_json",
    help="Absolute path of .json output file "
    + '(required if destination = "file_json")',
)
optional.add_argument(
    "-J",
    "--output-file-jenkins",
    dest="output_file_jenkins",
    help="Absolute path of jenkins benchmark .json output file "
    + '(required if destination = "jenkins_bench")',
)
optional.add_argument(
    "--fragment-size",
    dest="fragment_size",
    default="32000000",
    help="Fragment size to be used to create table. File specified in -c"
    + " is modified to replace ##FRAGMENT_SIZE## with value specified for"
    + " this parameter.",
)
args = parser.parse_args()
if args.verbose:
    logging.basicConfig(level=logging.DEBUG)
elif args.quiet:
    logging.basicConfig(level=logging.WARNING)
else:
    logging.basicConfig(level=logging.INFO)
source_db_user = args.user
source_db_passwd = args.passwd
source_db_server = args.server
source_db_port = args.port
source_db_name = args.name
label = args.label
import_file = args.import_file
table_schema_file = args.table_schema_file
import_table_name = args.import_table_name
import_query_template_file = args.import_query_template_file
no_drop_table_before = args.no_drop_table_before
no_drop_table_after = args.no_drop_table_after
import_test_name = args.import_test_name
machine_name = args.machine_name
machine_uname = args.machine_uname
destinations = args.destination.split(",")
# Initialize here so the validity check below cannot hit a NameError when
# no recognized destination was given
valid_destination_set = False
if "mapd_db" in destinations:
    valid_destination_set = True
    dest_db_user = args.dest_user
    dest_db_passwd = args.dest_passwd
    if args.dest_server is None:
        # If dest_server is not set for mapd_db, then exit
        logging.error('"dest_server" is required when destination = "mapd_db"')
        exit(1)
    else:
        dest_db_server = args.dest_server
    dest_db_port = args.dest_port
    dest_db_name = args.dest_name
    dest_table = args.dest_table
    dest_table_schema_file = args.dest_table_schema_file
if "file_json" in destinations:
    valid_destination_set = True
    if args.output_file_json is None:
        # If output_file_json is not set for file_json, then exit
        logging.error(
            '"output_file_json" is required when destination = "file_json"'
        )
        exit(1)
    else:
        output_file_json = args.output_file_json
if "output" in destinations:
    valid_destination_set = True
if "jenkins_bench" in destinations:
    valid_destination_set = True
    if args.output_file_jenkins is None:
        # If output_file_jenkins is not set for jenkins_bench, then exit
        logging.error(
            '"output_file_jenkins" is required '
            + 'when destination = "jenkins_bench"'
        )
        exit(1)
    elif args.import_test_name is None:
        # If import_test_name is not set for jenkins_bench, then exit
        logging.error(
            '"import_test_name" is required '
            + 'when destination = "jenkins_bench"'
        )
        exit(1)
    else:
        output_file_jenkins = args.output_file_jenkins
if not valid_destination_set:
    logging.error("No valid destination(s) have been set. Exiting.")
    exit(1)

# Establish connection to mapd db
con = get_connection(
    db_user=source_db_user,
    db_passwd=source_db_passwd,
    db_server=source_db_server,
    db_port=source_db_port,
    db_name=source_db_name,
)
if not con:
    exit(1)  # Exit if cannot connect to db

# Set run vars
run_guid = str(uuid.uuid4())
logging.debug("Run guid: " + run_guid)
run_timestamp = datetime.datetime.now()
run_connection = str(con)
logging.debug("Connection string: " + run_connection)
run_driver = ""  # TODO
run_version = con._client.get_version()
if "-" in run_version:
    run_version_short = run_version.split("-")[0]
else:
    run_version_short = run_version
conn_machine_name = re.search(r"@(.*?):", run_connection).group(1)
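# The regex above extracts the hostname from pymapd's connection string,
# which looks roughly like "mapd://mapd:***@localhost:6274/mapd"
# (illustrative); for that string, conn_machine_name is "localhost"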
# Set machine names, using local info if connected to localhost
if conn_machine_name == "localhost":
    local_uname = os.uname()
if machine_name:
    run_machine_name = machine_name
else:
    if conn_machine_name == "localhost":
        run_machine_name = local_uname.nodename
    else:
        run_machine_name = conn_machine_name
if machine_uname:
    run_machine_uname = machine_uname
else:
    if conn_machine_name == "localhost":
        run_machine_uname = " ".join(local_uname)
    else:
        run_machine_uname = ""

# See if import table exists, delete and/or create
if not no_drop_table_before:
    logging.info("Dropping import table if exists")
    con.execute("drop table if exists " + import_table_name)
    logging.debug("Creating import table.")
    try:
        with open(table_schema_file, "r") as table_schema:
            logging.debug("Reading table_schema_file: " + table_schema_file)
            create_table_sql = table_schema.read().replace("\n", " ")
            create_table_sql = create_table_sql.replace(
                "##TAB##", import_table_name
            )
            create_table_sql = create_table_sql.replace(
                "##FRAGMENT_SIZE##", args.fragment_size
            )
    except FileNotFoundError:
        logging.exception("Could not find table_schema_file.")
        exit(1)
    try:
        logging.debug("Creating import table...")
        res = con.execute(create_table_sql)
        logging.debug("Import table created.")
    except (pymapd.exceptions.ProgrammingError, pymapd.exceptions.Error):
        logging.exception("Error running table creation")
        exit(1)

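# Example table_schema_file contents (illustrative); ##TAB## and
# ##FRAGMENT_SIZE## are replaced above before the DDL is executed:
#   CREATE TABLE ##TAB## (
#     a INTEGER,
#     b TEXT ENCODING DICT(32)
#   ) WITH (FRAGMENT_SIZE = ##FRAGMENT_SIZE##);
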
# Run the import query
if import_query_template_file:
    try:
        with open(import_query_template_file, "r") as import_query_template:
            logging.debug(
                "Reading import_query_template_file: "
                + import_query_template_file
            )
            import_query = import_query_template.read().replace("\n", " ")
            import_query = import_query.replace(
                "##TAB##", import_table_name
            )
            import_query = import_query.replace(
                "##FILE##", import_file
            )
    except FileNotFoundError:
        logging.exception("Could not find import_query_template_file.")
        exit(1)
else:
    import_query = "COPY %s FROM '%s';" % (import_table_name, import_file)
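# Example import_query_template_file contents (illustrative), e.g. for a
# pipe-delimited source file:
#   COPY ##TAB## FROM '##FILE##' WITH (delimiter = '|');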
logging.debug("Import query: " + import_query)
logging.info("Starting import...")
start_time = timeit.default_timer()
try:
    res = con.execute(import_query)
    end_time = timeit.default_timer()
    logging.info("Completed import.")
except (pymapd.exceptions.ProgrammingError, pymapd.exceptions.Error):
    logging.exception("Error running import query")
    if no_drop_table_before:
        logging.error(
            'Import failed and "--no-drop-table-before" was '
            + "passed in. Make sure existing table schema matches "
            + "import .csv file schema."
        )
    exit(1)

# Calculate times and results values
query_elapsed_time = round(((end_time - start_time) * 1000), 1)
execution_time = res._result.execution_time_ms
connect_time = round((query_elapsed_time - execution_time), 2)
res_output = str(res.fetchall()[0])
logging.debug("Query result output: " + res_output)
rows_loaded = re.search(r"Loaded: (.*?) recs, R", res_output).group(1)
rows_rejected = re.search(r"Rejected: (.*?) recs i", res_output).group(1)
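# The regexes above parse the COPY FROM result message, which looks like
# (illustrative): ('Loaded: 100 recs, Rejected: 0 recs in 1.500000 secs',)
# and would yield rows_loaded = "100" and rows_rejected = "0"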

# Done with import. Dropping table and closing connection
if not no_drop_table_after:
    logging.debug("Dropping import table")
    con.execute("drop table " + import_table_name)
logging.debug("Closing source db connection.")
con.close()

# Update query dict entry with all values
result = {
    "run_guid": run_guid,
    "run_timestamp": run_timestamp,
    "run_connection": run_connection,
    "run_machine_name": run_machine_name,
    "run_machine_uname": run_machine_uname,
    "run_driver": run_driver,
    "run_version": run_version,
    "run_label": label,
    "import_test_name": import_test_name,
    "import_elapsed_time_ms": query_elapsed_time,
    "import_execute_time_ms": execution_time,
    "import_conn_time_ms": connect_time,
    "rows_loaded": rows_loaded,
    "rows_rejected": rows_rejected,
}

# Convert query list to json for outputs
result_json = json.dumps(result, default=json_format_handler, indent=2)

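# result_json is now plain JSON (illustrative excerpt); run_timestamp is
# serialized to an ISO-8601 string by json_format_handler:
#   {
#     "run_timestamp": "2020-01-01T00:00:00.000000",
#     "rows_loaded": "100",
#     ...
#   }
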
# Send results
if "mapd_db" in destinations:
    # Create dataframe from list of query results
    logging.debug("Converting results list to pandas dataframe")
    results_df = pandas.DataFrame(result, index=[0])
    # Establish connection to destination mapd db
    logging.debug("Connecting to destination mapd db")
    dest_con = get_connection(
        db_user=dest_db_user,
        db_passwd=dest_db_passwd,
        db_server=dest_db_server,
        db_port=dest_db_port,
        db_name=dest_db_name,
    )
    if not dest_con:
        exit(1)  # Exit if cannot connect to destination db
    # Load results into db, creating table if it does not exist
    tables = dest_con.get_tables()
    if dest_table not in tables:
        logging.info("Destination table does not exist. Creating.")
        try:
            with open(dest_table_schema_file, "r") as table_schema:
                logging.debug(
                    "Reading table_schema_file: " + dest_table_schema_file
                )
                create_table_sql = table_schema.read().replace("\n", " ")
                create_table_sql = create_table_sql.replace(
                    "##TAB##", dest_table
                )
        except FileNotFoundError:
            logging.exception("Could not find table_schema_file.")
            exit(1)
        try:
            logging.debug("Executing create destination table query")
            res = dest_con.execute(create_table_sql)
            logging.debug("Destination table created.")
        except (pymapd.exceptions.ProgrammingError, pymapd.exceptions.Error):
            logging.exception("Error running table creation")
            exit(1)
    logging.info("Loading results into destination db")
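    # pymapd's load_table with method="columnar" bulk-loads the DataFrame
    # column-by-column rather than row-by-row; create=False because the
    # results table is guaranteed to exist at this point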
    dest_con.load_table(
        dest_table, results_df, method="columnar", create=False
    )
    dest_con.close()
if "file_json" in destinations:
    # Write to json file; a context manager ensures the file is flushed
    # and closed
    logging.debug("Opening json output file for writing")
    with open(output_file_json, "w") as file_json_open:
        logging.info("Writing to output json file: " + output_file_json)
        file_json_open.write(result_json)
if "jenkins_bench" in destinations:
    # Write output to file formatted for jenkins benchmark plugin
    # https://github.com/jenkinsci/benchmark-plugin
    logging.debug("Constructing output for jenkins benchmark plugin")
    jenkins_bench_json = json.dumps(
        {
            "groups": [
                {
                    "name": import_test_name,
                    "description": "Import: " + import_test_name,
                    "tests": [
                        {
                            "name": "import",
                            "description": "",
                            "parameters": [],
                            "results": [
                                {
                                    "name": import_test_name + " average",
                                    "description": "",
                                    "unit": "ms",
                                    "dblValue": execution_time,
                                }
                            ],
                        }
                    ],
                }
            ]
        }
    )
    # Write to json file; a context manager ensures the file is closed
    logging.debug("Opening jenkins_bench json output file for writing")
    with open(output_file_jenkins, "w") as file_jenkins_open:
        logging.info(
            "Writing to jenkins_bench json file: " + output_file_jenkins
        )
        file_jenkins_open.write(jenkins_bench_json)
if "output" in destinations:
    logging.info("Printing query results to output")
    print(result_json)