File: /var/www/vhost/disk-apps/demo.sports-crowd.com/app/Console/myspider.py
import scrapy
import pymysql.cursors
import pymysql
import sys
class BlogSpider(scrapy.Spider):
name = 'getting_prices'
string_product = ''
# Cosntructor
def __init__(self, word = '', truncate = '', user = '', passw ='', host = '', db = ''):
# Se valida si se envio una palabra de busqueda. Caso contrario informa error.
if word == '':
sys.exit("No se ha indicado una palabra para la búsqueda")
self.string_product = word
self.conn = pymysql.connect(host = host, port=3306, user = user, passw = passw, db = db, charset='utf8')
self.cur = self.conn.cursor()
self.clearWordDB(word, truncate)
# Inicio de peticiones
def start_requests(self):
all_uri = ['https://www.tiendasjumbo.co/busca/?ft=<word>',
'https://www.drogueriascafam.com.co/buscar?search_query=<word>&controller=search',
'https://www.larebajavirtual.com/catalogo/buscar?busqueda=<word>',
'https://www.locatelcolombia.com/<word>',
'https://www.drogueriasanjorge.com/?s=<word>&post_type=product&product_cat=0',
'https://www.colsubsidioenlinea.com/tienda/SearchDisplay?storeId=10151&catalogId=10051&langId=-5&pageSize=12&beginIndex=0&searchSource=Q&sType=SimpleSearch&resultCatEntryType=2&showResultsPage=true&pageView=image&searchTerm=<word>']
for u in all_uri:
url = u.replace('<word>', self.string_product)
yield self.make_requests_from_url(url)
# Se extrae frase para conocer desde donde se hace el llamado en esta peticion y como se debe procesar.
def parse(self, response):
if "tiendasjumbo" in response.request.url:
return self.workForTiendasJumbo(response, "TiendasJumbo")
elif "cafam" in response.request.url:
return self.workForCafam(response, "Cafam")
elif "larebaja" in response.request.url:
return self.workForLaRebaja(response, "LaRebaja")
elif "locatel" in response.request.url:
return self.workForLocatel(response, "Locatel")
elif "sanjorge" in response.request.url:
return self.workForSanJorge(response, "SanJorge")
elif "colsubsidio" in response.request.url:
return self.workForColsubsidio(response, "Colsubsidio")
else:
return
# Procesamiento para pagina colsubsidio
def workForColsubsidio(self, response, store):
for obj in response.css('td.item'):
text = obj.css('div.description_fixedwidth > a::text').extract_first()
price = obj.css('div.price.offerprice.bold::text').extract_first()
text = self.formatText(text)
dict = {'word': self.string_product, 'word_store': text, 'price_store': price, 'store': store}
self.insertDB(dict)
# Procesamiento para pagina locatel
def workForLocatel(self, response, store):
store = 'Locatel'
for obj in response.css('div.mainRelatedProdContainer'):
text = obj.css('h3.nameProd > a > p::text').extract_first()
price = obj.css('p.priceProd::text').extract_first()
dict = {'word': self.string_product, 'word_store': text, 'price_store': price, 'store': store}
self.insertDB(dict)
# Procesamiento para pagina san jorge
def workForSanJorge(self, response, store):
store = 'DrogueriaSanJorge'
for obj in response.css('div.product-information'):
text = obj.css('h3.product-title > a ::text').extract_first()
price = obj.css('span.woocommerce-Price-amount.amount::text').extract_first()
dict = {'word': self.string_product, 'word_store': text, 'price_store': price, 'store': store}
self.insertDB(dict)
# Procesamiento para pagina cafam
def workForCafam(self, response, store):
store = 'Cafam'
for obj in response.css('div.product-container'):
text = obj.css('a.product-name > span.list-name ::text').extract_first()
price = obj.css('span.price.product-price::text').extract_first()
dict = {'word': self.string_product, 'word_store': text, 'price_store': price, 'store': store}
self.insertDB(dict)
# Procesamiento para pagina la rebaja
def workForLaRebaja(self, response, store):
store = 'LaRebaja'
for obj in response.css('div.content_product'):
text = obj.css('div.descripcion-lineal ::text').extract_first().strip() + ' ' + obj.css('div.line-bottom > div.descripcion-lineal ::text').extract_first().strip()
price = obj.css('div.price::text').extract_first()
dict = {'word': self.string_product, 'word_store': text, 'price_store': price, 'store': store}
self.insertDB(dict)
# Procesamiento para pagina tiendas jumbo
def workForTiendasJumbo(self, response, store):
store = 'TiendasJumbo'
for obj in response.css('div.product-item__bottom'):
text = obj.css('a.product-item__name > span ::text').extract_first()
price = obj.css('span.product-prices__value.product-prices__value--best-price::text').extract_first()
dict = {'word': self.string_product, 'word_store': text, 'price_store': price, 'store': store}
self.insertDB(dict)
# Se realiza proceso de limpieza de la palabra en la BD, si el parametro truncate llega en X se hace un truncado de la tabla.
def clearWordDB(self, word, truncate):
sql = "DELETE FROM results WHERE word = %s"
self.cur.execute(sql, word)
self.conn.commit()
if truncate == "X":
sql = "TRUNCATE TABLE results"
self.cur.execute(sql)
self.conn.commit()
# Se insertan datos en la BD y tabla indicadas.
def insertDB(self, item):
if item['word_store'] != None and item['price_store'] != None:
item['price_store'] = self.formatPrice(item['price_store'])
item['word_store'] = self.formatText(item['word_store'])
sql = "INSERT INTO results(word, word_store, price_store, store) VALUES (%s, %s, %s, %s)"
self.cur.execute(sql, (item['word'], item['word_store'], item['price_store'], item['store']) )
self.conn.commit()
# Se formate texto para remover, leading spaces
def formatText(self, text):
text = text.strip()
return text
# Se da formato al precio para extraer el valor sin simbolos especiales
def formatPrice(self, price):
# Verificar si el numero posee cents y remover
price = price.strip()
price = price.replace(" ", "")
if price[-3:] == ',00' or price[-3:] == '.00':
price = price[:-3]
# Remover el signo $
if "$" in price:
price = price.replace('$','')
# Remover todos los '.'
if "." in price:
price = price.replace('.','')
# Remover todas las ','
if "," in price:
price = price.replace(',','')
return price
# Finaliza y cierra conexión con BD.
def endDB(self):
self.cur.close()
self.conn.close()