HEX
Server: Apache/2.4.41 (Ubuntu)
System: Linux ip-172-31-42-149 5.15.0-1084-aws #91~20.04.1-Ubuntu SMP Fri May 2 07:00:04 UTC 2025 aarch64
User: ubuntu (1000)
PHP: 7.4.33
Disabled: pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,pcntl_unshare,
Upload Files
File: /var/www/vhost/disk-apps/agile-selling-mia/app/Console/myspider.py
import scrapy
import pymysql.cursors
import pymysql
import sys

class BlogSpider(scrapy.Spider):
    name = 'getting_prices'
    string_product = ''

# Cosntructor
    def __init__(self, word = '', truncate = '', user = '', passw ='', host = '', db = ''):
# Se valida si se envio una palabra de busqueda. Caso contrario informa error.
        if word == '': 
            sys.exit("No se ha indicado una palabra para la búsqueda")
        

        self.string_product = word
        self.conn = pymysql.connect(host = host, port=3306, user = user, passw = passw, db = db, charset='utf8')
        self.cur = self.conn.cursor()
        
        self.clearWordDB(word, truncate)

# Inicio de peticiones
    def start_requests(self):
        all_uri = ['https://www.tiendasjumbo.co/busca/?ft=<word>',
                    'https://www.drogueriascafam.com.co/buscar?search_query=<word>&controller=search',
                    'https://www.larebajavirtual.com/catalogo/buscar?busqueda=<word>',
                    'https://www.locatelcolombia.com/<word>',
                    'https://www.drogueriasanjorge.com/?s=<word>&post_type=product&product_cat=0',
                    'https://www.colsubsidioenlinea.com/tienda/SearchDisplay?storeId=10151&catalogId=10051&langId=-5&pageSize=12&beginIndex=0&searchSource=Q&sType=SimpleSearch&resultCatEntryType=2&showResultsPage=true&pageView=image&searchTerm=<word>']

        for u in all_uri:
            url = u.replace('<word>', self.string_product)
            yield self.make_requests_from_url(url)

# Se extrae frase para conocer desde donde se hace el llamado en esta peticion y como se debe procesar.
    def parse(self, response):
        if "tiendasjumbo" in response.request.url:
            return self.workForTiendasJumbo(response, "TiendasJumbo")
        elif "cafam" in response.request.url:
            return self.workForCafam(response, "Cafam")
        elif "larebaja" in response.request.url:
            return self.workForLaRebaja(response, "LaRebaja")
        elif "locatel" in response.request.url:
            return self.workForLocatel(response, "Locatel")
        elif "sanjorge" in response.request.url:
            return self.workForSanJorge(response, "SanJorge")
        elif "colsubsidio" in response.request.url:
            return self.workForColsubsidio(response, "Colsubsidio")
        else:
            return

# Procesamiento para pagina colsubsidio        
    def workForColsubsidio(self, response, store):
        for obj in response.css('td.item'):
            text =  obj.css('div.description_fixedwidth > a::text').extract_first()
            price = obj.css('div.price.offerprice.bold::text').extract_first()
            text = self.formatText(text)
            dict = {'word': self.string_product, 'word_store': text, 'price_store': price, 'store': store}
            self.insertDB(dict)

# Procesamiento para pagina locatel
    def workForLocatel(self, response, store):
        store = 'Locatel'
        for obj in response.css('div.mainRelatedProdContainer'):
            text =  obj.css('h3.nameProd > a > p::text').extract_first()
            price = obj.css('p.priceProd::text').extract_first()
            dict = {'word': self.string_product, 'word_store': text, 'price_store': price, 'store': store}
            self.insertDB(dict)

# Procesamiento para pagina san jorge
    def workForSanJorge(self, response, store):
        store = 'DrogueriaSanJorge'
        for obj in response.css('div.product-information'):
            text =  obj.css('h3.product-title > a ::text').extract_first()
            price = obj.css('span.woocommerce-Price-amount.amount::text').extract_first()
            dict = {'word': self.string_product, 'word_store': text, 'price_store': price, 'store': store}
            self.insertDB(dict)

# Procesamiento para pagina cafam
    def workForCafam(self, response, store):
        store = 'Cafam'
        for obj in response.css('div.product-container'):
            text =  obj.css('a.product-name > span.list-name ::text').extract_first()
            price = obj.css('span.price.product-price::text').extract_first()
            dict = {'word': self.string_product, 'word_store': text, 'price_store': price, 'store': store}
            self.insertDB(dict)            

# Procesamiento para pagina la rebaja    
    def workForLaRebaja(self, response, store):
        store = 'LaRebaja'
        for obj in response.css('div.content_product'):
            text =  obj.css('div.descripcion-lineal ::text').extract_first().strip() + ' ' + obj.css('div.line-bottom > div.descripcion-lineal ::text').extract_first().strip()
            price = obj.css('div.price::text').extract_first()
            dict = {'word': self.string_product, 'word_store': text, 'price_store': price, 'store': store}
            self.insertDB(dict)

# Procesamiento para pagina tiendas jumbo
    def workForTiendasJumbo(self, response, store):
        store = 'TiendasJumbo'
        for obj in response.css('div.product-item__bottom'):
            text =  obj.css('a.product-item__name > span ::text').extract_first()
            price = obj.css('span.product-prices__value.product-prices__value--best-price::text').extract_first()
            dict = {'word': self.string_product, 'word_store': text, 'price_store': price, 'store': store}
            self.insertDB(dict)

# Se realiza proceso de limpieza de la palabra en la BD, si el parametro truncate llega en X se hace un truncado de la tabla.
    def clearWordDB(self, word, truncate):
        sql = "DELETE FROM results WHERE word = %s"
        self.cur.execute(sql, word)
        self.conn.commit()

        if truncate == "X":
            sql = "TRUNCATE TABLE results"
            self.cur.execute(sql)
            self.conn.commit()

# Se insertan datos en la BD y tabla indicadas.
    def insertDB(self, item):
        if item['word_store'] != None and item['price_store'] != None:
            item['price_store'] = self.formatPrice(item['price_store'])
            item['word_store'] = self.formatText(item['word_store'])

            sql = "INSERT INTO results(word, word_store, price_store, store) VALUES (%s, %s, %s, %s)"
            self.cur.execute(sql, (item['word'], item['word_store'], item['price_store'], item['store']) )
            self.conn.commit()

# Se formate texto para remover, leading spaces
    def formatText(self, text):
        text = text.strip()

        return text

# Se da formato al precio para extraer el valor sin simbolos especiales
    def formatPrice(self, price):
        # Verificar si el numero posee cents y remover
        price = price.strip()
        price = price.replace(" ", "")

        if price[-3:] == ',00' or price[-3:] == '.00':
            price = price[:-3]
        
# Remover el signo $
        if "$" in price:
            price = price.replace('$','')

# Remover todos los '.'
        if "." in price:
            price = price.replace('.','')

# Remover todas las ','
        if "," in price:
            price = price.replace(',','')
        
        return price
    
# Finaliza y cierra conexión con BD.
    def endDB(self):
        self.cur.close()
        self.conn.close()